From a6f37a2bd543921ba3030a605e22f42980a33087 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 17 Oct 2019 13:25:08 -0700
Subject: [PATCH 01/59] [relay][vm] Separate VM runtime with executable (#4100)

* [relay][vm] Separate VM runtime with executable

* Address comments

* move ctx back to vm

* make only vm related fields and methods protected

* integrate serialization/deserialization to executable

* create stream
---
 include/tvm/runtime/vm.h                      | 210 ++++-
 python/tvm/relay/__init__.py                  |   2 -
 python/tvm/relay/backend/deserializer.py      |  81 --
 python/tvm/relay/backend/profiler_vm.py       |  12 +-
 python/tvm/relay/backend/serializer.py        | 191 -----
 python/tvm/relay/backend/vm.py                | 232 +++++-
 src/relay/backend/vm/compiler.cc              |  20 +-
 src/relay/backend/vm/compiler.h               |  12 +-
 src/relay/backend/vm/deserializer.cc          | 324 --------
 src/relay/backend/vm/deserializer.h           | 102 ---
 src/relay/backend/vm/profiler/compiler.cc     |   1 -
 src/relay/backend/vm/serializer.cc            | 439 -----------
 src/relay/backend/vm/serializer.h             | 202 -----
 src/runtime/vm/executable.cc                  | 734 ++++++++++++++++++
 src/runtime/vm/profiler/vm.cc                 |  29 +-
 src/runtime/vm/profiler/vm.h                  |   2 +
 .../backend => runtime}/vm/serialize_util.h   |  12 +-
 src/runtime/vm/vm.cc                          |  92 +--
 tests/python/relay/test_vm.py                 |  30 +-
 tests/python/relay/test_vm_serialization.py   | 119 ++-
 .../unittest/test_runtime_vm_profiler.py      |   4 +-
 21 files changed, 1285 insertions(+), 1565 deletions(-)
 delete mode 100644 python/tvm/relay/backend/deserializer.py
 delete mode 100644 python/tvm/relay/backend/serializer.py
 delete mode 100644 src/relay/backend/vm/deserializer.cc
 delete mode 100644 src/relay/backend/vm/deserializer.h
 delete mode 100644 src/relay/backend/vm/serializer.cc
 delete mode 100644 src/relay/backend/vm/serializer.h
 create mode 100644 src/runtime/vm/executable.cc
 rename src/{relay/backend => runtime}/vm/serialize_util.h (95%)

diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h
index aa8543d569af..a276c658c496 100644
--- a/include/tvm/runtime/vm.h
+++ b/include/tvm/runtime/vm.h
@@ -26,6 +26,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -430,15 +431,184 @@ struct VMFrame {
     caller_return_register(0) {}
 };
 
+/*! \brief The executable emitted by the VM compiler.
+ *
+ * The executable contains information (e.g. data in different memory regions)
+ * to run in a virtual machine.
+ *
+ *  - Global section, containing all globals.
+ *  - Constant section, storing the constant pool.
+ *  - Primitive name section, containing the function name of the primitive ops
+ *    used by the virtual machine.
+ *  - Code section, handling the VM functions and bytecode.
+ */
+class Executable : public ModuleNode {
+ public:
+  /*!
+   * \brief Get a PackedFunc from an executable module.
+   *
+   * \param name the name of the function.
+   * \param sptr_to_self The shared_ptr that points to this module node.
+   *
+   * \return PackedFunc or nullptr when it is not available.
+   */
+  PackedFunc GetFunction(const std::string& name,
+                         const std::shared_ptr& sptr_to_self) final;
+
+  /*!
+   * \brief Serialize the executable into global section, constant section, and
+   * code section.
+   *
+   * \return The binary representation of the VM.
+   */
+  TVMByteArray Save();
+
+  /*!
+   * \brief Load the saved VM executable.
+   *
+   * \param code The bytecode in string form.
+   * \param lib The compiled runtime library.
+   *
+   * \return exe The constructed executable.
+   */
+  static runtime::Module Load(const std::string& code, const runtime::Module lib);
+
+  /*! 
+   * \brief Get the serialized form of the `functions`. This is
+   * essentially bytecode serialization.
+   *
+   * \return The serialized vm bytecode.
+   *
+   * \note The bytecode is in the following format:
+   *   func_name reg_file_size num_instructions
+   *   param1 param2 ... paramM
+   *   instruction1
+   *   instruction2
+   *   ...
+   *   instructionN
+   *
+   * Each instruction is printed in the following format:
+   *   opcode num_fields field1 ... fieldX # The text format.
+   *
+   * Serializing an `Instruction` requires us to deal with the bytecode. Each line
+   * of the instructions could be serialized in the following format:
+   *   hash, opcode, f1, f2, ..., fX, field with variable length
+   *   1. hash: the hash of the instruction. This number will be used to help us
+   *   validate if an instruction is well-formed during deserialization.
+   *   2. opcode: the opcode of the instruction.
+   *   3. f1, f2, ..., fX. These fields together represent the fixed fields in
+   *   an instruction, e.g., `from` and `dst` fields of a `Move` instruction. For
+   *   example, `DLDataType` will be unpacked into three fields (code, bits, lanes).
+   *   4. The rest of the line indicates the field with variable length, e.g.,
+   *   the shape of a tensor, the args used by an `InvokePacked` instruction, etc.
+   *
+   * The part starting from # is only used for debugging. The serialized code
+   * doesn't contain it, therefore the deserializer doesn't need to handle it.
+   */
+  std::string GetBytecode() const;
+
+  /*!
+   * \brief Print the detailed statistics of the given code, i.e. number of
+   * globals and constants, etc.
+   */
+  std::string Stats() const;
+
+  /*! \brief Get the `lib` module in an executable. Users have the flexibility to call
+   * `export_library` from the frontend to save the library to disk.
+   *
+   * \return The runtime module that contains the hardware dependent code.
+   */
+  runtime::Module GetLib() const { return lib; }
+
+  virtual ~Executable() {}
+
+  const char* type_key() const final {
+    return "VMExecutable";
+  }
+
+  /*! \brief The runtime module/library that contains both the host and the device
+   * code when executing on non-CPU devices. */
+  runtime::Module lib;
+  /*! \brief The global constant pool. */
+  std::vector constants;
+  /*! \brief A map from globals (as strings) to their index in the function map. */
+  std::unordered_map global_map;
+  /*! \brief A mapping from the packed function (as string) to the index that
+   * corresponds to the position of the `packed_funcs` list in a `VirtualMachine` object.
+   */
+  std::unordered_map primitive_map;
+  /*! \brief The virtual machine's function table. */
+  std::vector functions;
+
+ private:
+  /*!
+   * \brief Save the globals.
+   *
+   * \param strm The output stream.
+   */
+  void SaveGlobalSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Save the constant pool.
+   *
+   * \param strm The output stream.
+   */
+  void SaveConstantSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Save primitive op names.
+   *
+   * \param strm The output stream.
+   */
+  void SavePrimitiveOpNames(dmlc::Stream* strm);
+
+  /*!
+   * \brief Save the vm functions.
+   *
+   * \param strm The output stream.
+   */
+  void SaveCodeSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load the globals.
+   *
+   * \param strm The input stream.
+   */
+  void LoadGlobalSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load the constant pool.
+   *
+   * \param strm The input stream.
+   */
+  void LoadConstantSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load primitive op names.
+   *
+   * \param strm The input stream. 
+ */ + void LoadPrimitiveOpNames(dmlc::Stream* strm); + + /*! + * \brief Load the vm functions. + * + * \param strm The input stream. + */ + void LoadCodeSection(dmlc::Stream* strm); + + /*! \brief The serialized bytecode. */ + std::string code_; +}; + /*! \brief The virtual machine. * * The virtual machine contains all the current execution state, - * as well as the global view of functions, the global constant - * table, the compiled operators. + * as well as the executable. * * The goal is to have a single self-contained object, * enabling one to easily pass around VMs, execute them on - * multiple threads, or serialized them to disk or over the + * multiple threads, or serialize them to disk or over the * wire. */ class VirtualMachine : public runtime::ModuleNode { @@ -486,16 +656,18 @@ class VirtualMachine : public runtime::ModuleNode { return "VirtualMachine"; } - /*! \brief The runtime module/library that contains generated code. */ - runtime::Module lib; + VirtualMachine() : frames(), func_index(0), code(nullptr), pc(0), exec(nullptr) {} + + /*! \brief load the executable for the virtual machine. + * \param exec The executable. + */ + void LoadExecutable(const Executable* exec); + + protected: /*! \brief The virtual machine's packed function table. */ std::vector packed_funcs; - /*! \brief The virtual machine's function table. */ - std::vector functions; /*! \brief The current stack of call frames. */ std::vector frames; - /*! \brief The global constant pool. */ - std::vector constants; /*! \brief The fuction table index of the current function. */ Index func_index; /*! \brief The current pointer to the code section. */ @@ -506,6 +678,9 @@ class VirtualMachine : public runtime::ModuleNode { /*! \brief The special return register. */ ObjectRef return_register; + /*! \brief The executable the VM will operate on. */ + const Executable* exec; + /*! \brief The set of TVM contexts the VM is currently executing on. */ std::vector ctxs; @@ -550,8 +725,6 @@ class VirtualMachine : public runtime::ModuleNode { */ ObjectRef Invoke(const std::string& name, const std::vector& args); - VirtualMachine() : functions(), frames(), func_index(0), code(nullptr), pc(0) {} - /*! \brief Initialize the virtual machine for a set of contexts. * \param contexts The set of TVM contexts. */ @@ -565,21 +738,6 @@ class VirtualMachine : public runtime::ModuleNode { */ TVMContext GetParamsContext() const; - /*! - * \brief Load parameters from the parameter bytearray. - * \param params The binary file that contains parameters. - */ - void LoadParams(const std::string& params); - - /*! \brief A map from globals (as strings) to their index in the function map. - */ - std::unordered_map global_map; - - /*! \brief A mapping from the packed function (as string) to the index that - * corresponds to the position of the `packed_funcs` list. - */ - std::unordered_map primitive_map; - private: /*! \brief Invoke a global setting up the VM state to execute. * diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index ceb98c4d251e..fff9c99e5007 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -37,8 +37,6 @@ from . 
import feature from .backend import vm from .backend import profiler_vm -from .backend import serializer -from .backend import deserializer from .backend import vmobj # Root operators diff --git a/python/tvm/relay/backend/deserializer.py b/python/tvm/relay/backend/deserializer.py deleted file mode 100644 index fde702b1cd04..000000000000 --- a/python/tvm/relay/backend/deserializer.py +++ /dev/null @@ -1,81 +0,0 @@ -# License .to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -""" -The Relay Virtual Machine deserializer. - -Python interface for deserializing a Relay VM. -""" -from tvm import module -from tvm._ffi.runtime_ctypes import TVMByteArray -from . import _vm -from . import vm as rly_vm - -def _create_deserializer(code, lib): - """Create a deserializer object. - - Parameters - ---------- - code : bytearray - The serialized virtual machine code. - - lib : :py:class:`~tvm.module.Module` - The serialized runtime module/library that contains the hardware - dependent binary code. - - Returns - ------- - ret : Deserializer - The created virtual machine deserializer. - """ - if isinstance(code, (bytes, str)): - code = bytearray(code) - elif not isinstance(code, (bytearray, TVMByteArray)): - raise TypeError("vm is expected to be the type of bytearray or " + - "TVMByteArray, but received {}".format(type(code))) - - if not isinstance(lib, module.Module): - raise TypeError("lib is expected to be the type of tvm.module.Module" + - ", but received {}".format(type(lib))) - return _vm._Deserializer(code, lib) - - -class Deserializer: - """Relay VM deserializer. - - Parameters - ---------- - code : bytearray - The serialized virtual machine code. - - lib : :py:class:`~tvm.module.Module` - The serialized runtime module/library that contains the hardware - dependent binary code. - """ - def __init__(self, code, lib): - self.mod = _create_deserializer(code, lib) - self._deserialize = self.mod["deserialize"] - - def deserialize(self): - """Deserialize the serialized bytecode into a Relay VM. - - Returns - ------- - ret : VirtualMachine - The deserialized Relay VM. - """ - return rly_vm.VirtualMachine(self._deserialize()) diff --git a/python/tvm/relay/backend/profiler_vm.py b/python/tvm/relay/backend/profiler_vm.py index 8ae3161e0b83..b36715249f0a 100644 --- a/python/tvm/relay/backend/profiler_vm.py +++ b/python/tvm/relay/backend/profiler_vm.py @@ -49,8 +49,8 @@ def compile(mod, target=None, target_host=None, params=None): Returns ------- - vm : VirtualMachineProfiler - The profile VM runtime. + exec : Executable + The executable with profiling code. 
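+
+    Examples
+    --------
+    A minimal sketch of the intended flow (illustrative only; it assumes an
+    "llvm" target, a CPU context, and a trivial Relay module):
+
+    .. code-block:: python
+
+        import numpy as np
+        import tvm
+        from tvm import relay
+        from tvm.relay.backend import profiler_vm
+
+        x = relay.var('x', shape=(10, 10))
+        mod = relay.Module({"main": relay.Function([x], x + x)})
+        exe = profiler_vm.compile(mod, "llvm")
+        vm = profiler_vm.VirtualMachineProfiler(exe)
+        vm.init(tvm.cpu())
+        vm.run(np.random.rand(10, 10).astype('float32'))
+        # print the per-op profile collected by the debug runtime
+        print(vm.get_stat())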
""" compiler = VMCompilerProfiler() target = compiler.update_target(target) @@ -60,7 +60,7 @@ def compile(mod, target=None, target_host=None, params=None): tophub_context = compiler.tophub_context(target) with tophub_context: compiler._compile(mod, target, target_host) - return VirtualMachineProfiler(compiler._get_vm()) + return vm.Executable(compiler._get_exec()) class VMCompilerProfiler(vm.VMCompiler): """Build Relay module to run on VM runtime.""" @@ -68,13 +68,17 @@ def __init__(self): super().__init__() self.mod = _vm._VMCompilerProfiler() self._compile = self.mod["compile"] - self._get_vm = self.mod["get_vm"] + self._get_exec = self.mod["get_executable"] self._set_params_func = self.mod["set_params"] class VirtualMachineProfiler(vm.VirtualMachine): """Relay profile VM runtime.""" def __init__(self, mod): super().__init__(mod) + m = mod.module if isinstance(mod, vm.Executable) else mod + self.mod = _vm._VirtualMachineDebug(m) + self._init = self.mod["init"] + self._invoke = self.mod["invoke"] self._get_stat = self.mod["get_stat"] def get_stat(self): diff --git a/python/tvm/relay/backend/serializer.py b/python/tvm/relay/backend/serializer.py deleted file mode 100644 index b45ba9116a15..000000000000 --- a/python/tvm/relay/backend/serializer.py +++ /dev/null @@ -1,191 +0,0 @@ -# License .to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -""" -The Relay Virtual Machine serializer. - -Python interface for serializing a Relay VM. -""" -import tvm -from . import _vm -from . import vm as rly_vm - -def _create_serializer(vm): - """Create a VM serializer. - - Parameters - ---------- - vm : Union[VirtualMachine, :py:class:`~tvm.module.Module`] - The virtual machine to be serialized. - - Returns - ------- - ret : Serializer - The created virtual machine serializer. - """ - if isinstance(vm, rly_vm.VirtualMachine): - vm = vm.module - elif not isinstance(vm, tvm.module.Module): - raise TypeError("vm is expected to be the type of VirtualMachine or " + - "tvm.Module, but received {}".format(type(vm))) - - return _vm._Serializer(vm) - - -class Serializer: - """Relay VM serializer.""" - def __init__(self, vm): - self.mod = _create_serializer(vm) - self._get_lib = self.mod["get_lib"] - self._get_bytecode = self.mod["get_bytecode"] - self._get_globals = self.mod["get_globals"] - self._get_stats = self.mod["get_stats"] - self._get_primitive_ops = self.mod["get_primitive_ops"] - self._serialize = self.mod["serialize"] - - @property - def stats(self): - """Get the statistics of the Relay VM. - - Returns - ------- - ret : String - The serialized statistic information. - """ - return self._get_stats() - - @property - def primitive_ops(self): - """Get the name of the primitive ops that are executed in the VM. 
- - Returns - ------- - ret : List[:py:class:`~tvm.expr.StringImm`] - The list of primitive ops. - """ - return [prim_op.value for prim_op in self._get_primitive_ops()] - - @property - def bytecode(self): - """Get the bytecode of the Relay VM. - - Returns - ------- - ret : String - The serialized bytecode. - - Notes - ----- - The bytecode is in the following format: - func_name reg_file_size num_instructions - param1 param2 ... paramM - instruction1 - instruction2 - ... - instructionN - - Each instruction is printed in the following format: - hash opcode field1 ... fieldX # The text format. - - The part starting from # is only used for visualization and debugging. - The real serialized code doesn't contain it, therefore the deserializer - doesn't need to deal with it as well. - """ - return self._get_bytecode() - - @property - def globals(self): - """Get the globals used by the Relay VM. - - Returns - ------- - ret : List[:py:class:`~tvm.expr.StringImm`] - The serialized globals. - """ - return [glb.value for glb in self._get_globals()] - - def serialize(self): - """Serialize the Relay VM. - - Returns - ------- - code : bytearray - The binary blob representing a serialized Relay VM. It can then be - saved to disk and later deserialized into a new VM. - - lib : :py:class:`~tvm.module.Module` - The runtime module that contains the generated code. It is - basically a library that is composed of hardware dependent code. - - Notes - ----- - The returned code is organized with the following sections in order. - - Global section. This section contains the globals used by the - virtual machine. - - Constant section. This section is used to store the constant pool of - a virtual machine. - - Primitive name section. This section is introduced to accommodate - the list of primitive operator names that will be invoked by the - virtual machine. - - Code section. The VM functions, including bytecode, are sitting in - this section. - - Examples - -------- - .. code-block:: python - - import numpy as np - import tvm - from tvm import relay - - # define a simple network. - x = relay.var('x', shape=(10, 10)) - f = relay.Function([x], x + x) - mod = relay.Module({"main": f}) - - # create a Relay VM. - ctx = tvm.cpu() - target = "llvm" - compiler = relay.vm.VMCompiler() - vm = compiler.compile(mod, target) - vm.init(ctx) - - # serialize. - ser = relay.serializer.Serializer(vm) - code, lib = ser.serialize() - - # save and load the code and lib file. - tmp = tvm.contrib.util.tempdir() - path_lib = tmp.relpath("lib.so") - lib.export_library(path_lib) - with open(tmp.relpath("code.bc"), "wb") as fo: - fo.write(code) - - loaded_lib = tvm.module.load(path_lib) - loaded_code = bytearray(open(tmp.relpath("code.bc"), "rb").read()) - - # deserialize. - deser = relay.deserializer.Deserializer(loaded_code, loaded_lib) - des_vm = deser.deserialize() - - # execute the deserialized vm. - des_vm.init(ctx) - x_data = np.random.rand(10, 10).astype('float32') - res = des_vm.run(x_data) - print(res.asnumpy()) - """ - return self._serialize(), self._get_lib() diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index c24b16ca6437..942c93b866f4 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -24,8 +24,8 @@ import tvm from tvm import autotvm -from tvm._ffi.runtime_ctypes import TVMByteArray from tvm.relay import expr as _expr +from tvm._ffi.runtime_ctypes import TVMByteArray from . import _vm from . 
import vmobj as _obj
 from .interpreter import Executor
@@ -44,6 +44,7 @@ def _convert(arg, cargs):
     else:
         raise "unsupported type"
 
+
 def convert(args):
     cargs = []
     for arg in args:
@@ -52,12 +53,202 @@ def convert(args):
     return cargs
 
 
+class Executable(object):
+    """Relay VM executable"""
+    def __init__(self, mod):
+        self.mod = mod
+        self._save = self.mod["save"]
+        self._get_lib = self.mod["get_lib"]
+        self._get_bytecode = self.mod["get_bytecode"]
+        self._get_stats = self.mod["get_stats"]
+
+    def save(self):
+        """Save the Relay VM Executable.
+
+        Returns
+        -------
+        code : bytearray
+            The binary blob representing a serialized Relay VM executable. It
+            can then be saved to disk and later deserialized into a new
+            Executable.
+
+        lib : :py:class:`~tvm.module.Module`
+            The runtime module that contains the generated code. It is
+            basically a library that is composed of hardware dependent code.
+
+        Notes
+        -----
+        The returned code is organized with the following sections in order.
+         - Global section. This section contains the globals used by the
+           virtual machine.
+         - Constant section. This section is used to store the constant pool of
+           a virtual machine.
+         - Primitive name section. This section is introduced to accommodate
+           the list of primitive operator names that will be invoked by the
+           virtual machine.
+         - Code section. The VM functions, including bytecode, are sitting in
+           this section.
+
+        Examples
+        --------
+
+        .. code-block:: python
+
+            import numpy as np
+            import tvm
+            from tvm import relay
+            # define a simple network.
+            x = relay.var('x', shape=(10, 10))
+            f = relay.Function([x], x + x)
+            mod = relay.Module({"main": f})
+            # create a Relay VM.
+            ctx = tvm.cpu()
+            target = "llvm"
+            executable = relay.vm.compile(mod, target)
+            code, lib = executable.save()
+            # save and load the code and lib file.
+            tmp = tvm.contrib.util.tempdir()
+            path_lib = tmp.relpath("lib.so")
+            lib.export_library(path_lib)
+            with open(tmp.relpath("code.ro"), "wb") as fo:
+                fo.write(code)
+            loaded_lib = tvm.module.load(path_lib)
+            loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
+            # deserialize.
+            des_exec = relay.vm.Executable.load_exec(loaded_code, loaded_lib)
+            # execute the deserialized executable.
+            x_data = np.random.rand(10, 10).astype('float32')
+            des_vm = relay.vm.VirtualMachine(des_exec)
+            des_vm.init(ctx)
+            res = des_vm.run(x_data)
+            print(res.asnumpy())
+        """
+        return self._save(), self._get_lib()
+
+    @staticmethod
+    def load_exec(bytecode, lib):
+        """Construct an executable from saved artifacts.
+
+        Parameters
+        ----------
+        bytecode : bytearray
+            The binary blob representing the Relay VM bytecode.
+
+        lib : :py:class:`~tvm.module.Module`
+            The runtime module that contains the generated code.
+
+        Returns
+        -------
+        exec : Executable
+            An executable constructed using the provided artifacts.
+        """
+        if isinstance(bytecode, (bytes, str)):
+            bytecode = bytearray(bytecode)
+        elif not isinstance(bytecode, (bytearray, TVMByteArray)):
+            raise TypeError("bytecode is expected to be the type of bytearray " +
+                            "or TVMByteArray, but received {}".format(type(bytecode)))
+
+        if not isinstance(lib, tvm.module.Module):
+            raise TypeError("lib is expected to be the type of tvm.module.Module" +
+                            ", but received {}".format(type(lib)))
+
+        return Executable(_vm.Load_Executable(bytecode, lib))
+
+    @property
+    def lib(self):
+        """Get the library that contains hardware dependent code.
+
+        Returns
+        -------
+        ret : :py:class:`~tvm.Module`
+            The runtime module that contains hardware dependent code. 
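+
+        Examples
+        --------
+        A small sketch (illustrative only; the library file name is arbitrary):
+
+        .. code-block:: python
+
+            import tvm
+            from tvm import relay
+
+            x = relay.var('x', shape=(10, 10))
+            mod = relay.Module({"main": relay.Function([x], x + x)})
+            exe = relay.vm.compile(mod, "llvm")
+            # export the hardware dependent code as a shared library
+            exe.lib.export_library("vm_lib.so")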
+ """ + return self._get_lib() + + @property + def stats(self): + """Get the statistics of the Relay VM executable. + + Returns + ------- + ret : String + The statistic information of the VM executable. + """ + return self._get_stats() + + @property + def primitive_ops(self): + """Get the name of the primitive ops contained in the executable. + + Returns + ------- + ret : List[String] + The list of primitive ops. + """ + ret = [] + num_primitives = _vm.GetNumOfPrimitives(self.module) + for i in range(num_primitives): + ret.append(_vm.GetPrimitiveFields(self.module, i)) + return ret + + @property + def bytecode(self): + """Get the bytecode of the Relay VM executable. + + Returns + ------- + ret : String + The bytecode of the executable. + + Notes + ----- + The bytecode is in the following format: + func_name reg_file_size num_instructions + param1 param2 ... paramM + instruction1 + instruction2 + ... + instructionN + + Each instruction is printed in the following format: + hash opcode field1 ... fieldX # The text format. + + The part starting from # is only used for visualization and debugging. + The real serialized code doesn't contain it, therefore the deserializer + doesn't need to deal with it as well. + """ + return self._get_bytecode() + + @property + def globals(self): + """Get the globals used by the Relay VM executable. + + Returns + ------- + ret : List[String] + The globals contained in the executable. + """ + ret = [] + num_globals = _vm.GetNumOfGlobals(self.module) + for i in range(num_globals): + ret.append(_vm.GetGlobalFields(self.module, i)) + return ret + + @property + def module(self): + """Return the runtime module contained in a virtual machine executable.""" + return self.mod + + class VirtualMachine(object): """Relay VM runtime.""" def __init__(self, mod): - self.mod = mod + if not isinstance(mod, (Executable, tvm.module.Module)): + raise TypeError("mod is expected to be the type of Executable or " + + "tvm.Module, but received {}".format(type(mod))) + m = mod.module if isinstance(mod, Executable) else mod + self.mod = _vm._VirtualMachine(m) self._init = self.mod["init"] - self._load_params = self.mod["load_params"] self._invoke = self.mod["invoke"] def init(self, ctx): @@ -71,23 +262,6 @@ def init(self, ctx): args = [ctx.device_type, ctx.device_id] self._init(*args) - def load_params(self, params): - """Load parameters for the VM. - - Parameters - ---------- - params : Union[bytearray, Dict] - The dictionary that contains serialized parameters. - """ - if isinstance(params, dict): - params = tvm.relay.save_param_dict(params) - elif isinstance(params, (bytes, str)): - params = bytearray(params) - if not isinstance(params, (bytearray, TVMByteArray)): - raise TypeError("params must be a bytearray") - - self._load_params(bytearray(params)) - def invoke(self, func_name, *args): """Invoke a function. @@ -122,11 +296,6 @@ def run(self, *args): """ return self.invoke("main", *args) - @property - def module(self): - """Return the runtime module contained in a virtual machine.""" - return self.mod - def compile(mod, target=None, target_host=None, params=None): """ @@ -155,8 +324,8 @@ def compile(mod, target=None, target_host=None, params=None): Returns ------- - vm : VirtualMachine - The VM runtime. + exec : Executable + The VM executable that contains both library code and bytecode. 
""" compiler = VMCompiler() @@ -167,14 +336,14 @@ def compile(mod, target=None, target_host=None, params=None): tophub_context = compiler.tophub_context(target) with tophub_context: compiler._compile(mod, target, target_host) - return VirtualMachine(compiler._get_vm()) + return Executable(compiler._get_exec()) class VMCompiler(object): """Build Relay module to run on VM runtime.""" def __init__(self): self.mod = _vm._VMCompiler() self._compile = self.mod["compile"] - self._get_vm = self.mod["get_vm"] + self._get_exec = self.mod["get_executable"] self._set_params_func = self.mod["set_params"] def set_params(self, params): @@ -240,7 +409,7 @@ class VMExecutor(Executor): mod : :py:class:`~tvm.relay.module.Module` The module to support the execution. - ctx : :py:class:`TVMContext` + ctx : :py:class:`~tvm.TVMContext` The runtime context to run the code on. target : :py:class:`Target` @@ -252,7 +421,8 @@ def __init__(self, mod, ctx, target): self.mod = mod self.ctx = ctx self.target = target - self.vm = compile(mod, target) + self.executable = compile(mod, target) + self.vm = VirtualMachine(self.executable) self.vm.init(ctx) def _make_executor(self, expr=None): diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 0cfae374ab2c..f295ccd7a555 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -783,9 +783,9 @@ PackedFunc VMCompiler::GetFunction(const std::string& name, Module mod = args[0]; this->Compile(mod, args[1], args[2]); }); - } else if (name == "get_vm") { + } else if (name == "get_executable") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = runtime::Module(vm_); + *rv = runtime::Module(exec_); }); } else if (name == "set_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -864,7 +864,7 @@ void VMCompiler::Compile(Module mod, // Next we get ready by allocating space for // the global state. 
- vm_->functions.resize(context_.module->functions.size()); + exec_->functions.resize(context_.module->functions.size()); for (auto named_func : context_.module->functions) { auto gvar = named_func.first; @@ -873,25 +873,25 @@ void VMCompiler::Compile(Module mod, auto vm_func = func_compiler.Compile(gvar, func); size_t func_index = context_.global_map.at(gvar); - CHECK(func_index < vm_->functions.size()); - vm_->functions[func_index] = vm_func; + CHECK(func_index < exec_->functions.size()); + exec_->functions[func_index] = vm_func; } #if USE_RELAY_DEBUG - for (auto vm_func : vm_->functions) { + for (auto vm_func : exec_->functions) { DLOG(INFO) << vm_func << "-------------"; } #endif // USE_RELAY_DEBUG // populate constants for (auto data : context_.constants) { - vm_->constants.push_back(runtime::vm::Tensor(data)); + exec_->constants.push_back(runtime::vm::Tensor(data)); } LibraryCodegen(); for (auto gv : context_.global_map) { - vm_->global_map.insert({gv.first->name_hint, gv.second}); + exec_->global_map.insert({gv.first->name_hint, gv.second}); } } @@ -987,13 +987,13 @@ void VMCompiler::LibraryCodegen() { // therefore target won't be used in the build function runtime::Module mod = (*f)(funcs, Target(), target_host_); CHECK(mod.operator->()); - vm_->lib = mod; + exec_->lib = mod; } else { LOG(FATAL) << "relay.backend.build is not registered"; } size_t primitive_index = 0; for (auto cfunc : cached_funcs) { - vm_->primitive_map.insert({cfunc->funcs[0]->name, primitive_index++}); + exec_->primitive_map.insert({cfunc->funcs[0]->name, primitive_index++}); } } diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index dff1ef7f4569..215cc12c4cdb 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -92,12 +92,8 @@ class VMCompiler : public runtime::ModuleNode { return "VMCompiler"; } - std::shared_ptr GetVirtualMachine() const { - return vm_; - } - - virtual void InitVM() { - vm_ = std::make_shared(); + void InitVM() { + exec_ = std::make_shared(); } /*! @@ -144,8 +140,8 @@ class VMCompiler : public runtime::ModuleNode { tvm::Target target_host_; /*! \brief Global shared meta data */ VMCompilerContext context_; - /*! \brief Compiled virtual machine. */ - std::shared_ptr vm_; + /*! \brief Compiled executable. */ + std::shared_ptr exec_; /*! \brief parameters */ std::unordered_map params_; }; diff --git a/src/relay/backend/vm/deserializer.cc b/src/relay/backend/vm/deserializer.cc deleted file mode 100644 index 777282782e99..000000000000 --- a/src/relay/backend/vm/deserializer.cc +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2019 by Contributors - * \file src/relay/backend/vm/deserializer.cc - * \brief Implementation of APIs to deserialize the serialized VM bytecode. - */ - -#include "deserializer.h" - -#include -#include -#include - -#include "serialize_util.h" - -namespace tvm { -namespace relay { -namespace vm { - -#define STREAM_CHECK(val, section) \ - CHECK(val) << "Invalid VM file format in the " << section << " section." \ - << "\n"; - -void Deserializer::Init(const std::string& code, const runtime::Module& lib) { - code_ = code; - vm_ = std::make_shared(); - vm_->lib = lib; - strm_ = new dmlc::MemoryStringStream(&code_); -} - -runtime::PackedFunc Deserializer::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - if (name == "deserialize") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - this->Deserialize(); - *rv = runtime::Module(vm_); - }); - } else { - LOG(FATAL) << "Unknown packed function: " << name; - return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); - } -} - -void Deserializer::Deserialize() { - // Check header. - uint64_t header; - STREAM_CHECK(strm_->Read(&header), "header"); - STREAM_CHECK(header == kTVMVMBytecodeMagic, "header"); - - // Check version. - std::string version; - STREAM_CHECK(strm_->Read(&version), "version"); - STREAM_CHECK(version == TVM_VERSION, "version"); - - // Global section. - DeserializeGlobalSection(); - - // Constant section. - DeserializeConstantSection(); - - // Primitive names that will be invoked by `InvokePacked` instructions. - DeserializePrimitiveOpNames(); - - // Code section. - DeserializeCodeSection(); -} - -void Deserializer::DeserializeGlobalSection() { - std::vector globals; - STREAM_CHECK(strm_->Read(&globals), "global"); - for (size_t i = 0; i < globals.size(); i++) { - vm_->global_map.insert({globals[i], i}); - } -} - -void Deserializer::DeserializeConstantSection() { - uint64_t sz; - // Load the number of constants. - STREAM_CHECK(strm_->Read(&sz, sizeof(sz)), "constant"); - - size_t size = static_cast(sz); - // Load each of the constants. - for (size_t i = 0; i < size; i++) { - runtime::NDArray constant; - STREAM_CHECK(constant.Load(strm_), "constant"); - runtime::ObjectRef obj = runtime::vm::Tensor(constant); - vm_->constants.push_back(obj); - } -} - -void Deserializer::DeserializePrimitiveOpNames() { - std::vector primitive_names; - STREAM_CHECK(strm_->Read(&primitive_names), "primitive name"); - for (size_t i = 0; i < primitive_names.size(); i++) { - vm_->primitive_map.insert({primitive_names[i], i}); - } -} - -// Extract the `cnt` number of fields started at `start` from the list -// `instr_fields`. 
-inline std::vector ExtractFields(const std::vector& instr_fields, - Index start, - Index cnt) { - CHECK_LE(static_cast(start + cnt), instr_fields.size()); - std::vector ret; - for (auto i = start; i < start + cnt; i++) { - ret.push_back(instr_fields[i]); - } - return ret; -} - -Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { - Opcode opcode = static_cast(instr.opcode); - switch (opcode) { - case Opcode::Move: { - // Number of fields = 2 - DCHECK_EQ(instr.fields.size(), 2U); - return Instruction::Move(instr.fields[0], instr.fields[1]); - } - case Opcode::Ret: { - // Number of fields = 1 - DCHECK_EQ(instr.fields.size(), 1U); - return Instruction::Ret(instr.fields[0]); - } - case Opcode::Fatal: { - // Number of fields = 0 - DCHECK(instr.fields.empty()); - return Instruction::Fatal(); - } - case Opcode::InvokePacked: { - // Number of fields = 3 + instr.arity - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index packed_index = instr.fields[0]; - Index arity = instr.fields[1]; - Index output_size = instr.fields[2]; - std::vector args = ExtractFields(instr.fields, 3, arity); - return Instruction::InvokePacked(packed_index, arity, output_size, args); - } - case Opcode::AllocTensor: { - // Number of fields = 5 + instr.alloc_tensor.ndim - DCHECK_GE(instr.fields.size(), 5U); - DCHECK_EQ(instr.fields.size(), 5U + static_cast(instr.fields[3])); - - DLDataType dtype; - dtype.code = instr.fields[0]; - dtype.bits = instr.fields[1]; - dtype.lanes = instr.fields[2]; - - Index ndim = instr.fields[3]; - RegName dst = instr.fields[4]; - - std::vector shape = ExtractFields(instr.fields, 5, ndim); - - return Instruction::AllocTensor(shape, dtype, dst); - } - case Opcode::AllocTensorReg: { - // Number of fields = 5 - DCHECK_EQ(instr.fields.size(), 5U); - Index shape_register = instr.fields[0]; - - DLDataType dtype; - dtype.code = instr.fields[1]; - dtype.bits = instr.fields[2]; - dtype.lanes = instr.fields[3]; - - RegName dst = instr.fields[4]; - - return Instruction::AllocTensorReg(shape_register, dtype, dst); - } - case Opcode::AllocDatatype: { - // Number of fields = 3 + instr.num_fields - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index constructor_tag = instr.fields[0]; - Index num_fields = instr.fields[1]; - RegName dst = instr.fields[2]; - std::vector fields = ExtractFields(instr.fields, 3, num_fields); - - return Instruction::AllocDatatype(constructor_tag, num_fields, fields, dst); - } - case Opcode::AllocClosure: { - // Number of fields = 3 + instr.num_freevar - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index clo_index = instr.fields[0]; - Index num_freevar = instr.fields[1]; - RegName dst = instr.fields[2]; - std::vector free_vars = ExtractFields(instr.fields, 3, num_freevar); - - return Instruction::AllocClosure(clo_index, num_freevar, free_vars, dst); - } - case Opcode::If: { - // Number of fields = 4 - DCHECK_EQ(instr.fields.size(), 4U); - Index test = instr.fields[0]; - Index target = instr.fields[1]; - Index true_offset = instr.fields[2]; - Index false_offset = instr.fields[3]; - - return Instruction::If(test, target, true_offset, false_offset); - } - case Opcode::Invoke: { - // Number of fields = 3 + instr.num_args - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index func_index = instr.fields[0]; - Index num_args = 
instr.fields[1]; - RegName dst = instr.fields[2]; - std::vector args = ExtractFields(instr.fields, 3, num_args); - - return Instruction::Invoke(func_index, args, dst); - } - case Opcode::InvokeClosure: { - // Number of fields = 3 + instr.num_closure_args - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index closure = instr.fields[0]; - Index num_closure_args = instr.fields[1]; - RegName dst = instr.fields[2]; - std::vector args = ExtractFields(instr.fields, 3, num_closure_args); - - return Instruction::InvokeClosure(closure, args, dst); - } - case Opcode::LoadConst: { - // Number of fields = 2 - DCHECK_EQ(instr.fields.size(), 2U); - return Instruction::LoadConst(instr.fields[0], instr.fields[1]); - } - case Opcode::LoadConsti: { - // Number of fields = 2 - DCHECK_EQ(instr.fields.size(), 2U); - return Instruction::LoadConsti(instr.fields[0], instr.fields[1]); - } - case Opcode::GetField: { - // Number of fields = 3 - DCHECK_EQ(instr.fields.size(), 3U); - return Instruction::GetField(instr.fields[0], instr.fields[1], instr.fields[2]); - } - case Opcode::GetTag: { - // Number of fields = 2 - DCHECK_EQ(instr.fields.size(), 2U); - return Instruction::GetTag(instr.fields[0], instr.fields[1]); - } - case Opcode::Goto: { - // Number of fields = 1 - DCHECK_EQ(instr.fields.size(), 1U); - return Instruction::Goto(instr.fields[0]); - } - default: - LOG(FATAL) << "Invalid opcode" << instr.opcode; - return Instruction(); - } -} - -void Deserializer::DeserializeCodeSection() { - // Load the number of functions. - uint64_t sz; - STREAM_CHECK(strm_->Read(&sz, sizeof(sz)), "code"); - - size_t num_funcs = static_cast(sz); - vm_->functions.resize(num_funcs); - for (size_t i = 0; i < num_funcs; i++) { - // Load the function info. - VMFunctionSerializer loaded_func; - STREAM_CHECK(loaded_func.Load(strm_), "code/function"); - - // Load the instructions. - std::vector instructions; - for (size_t j = 0; j < loaded_func.num_instructions; j++) { - VMInstructionSerializer instr; - std::vector instr_fields; - STREAM_CHECK(instr.Load(strm_), "code/instruction"); - instructions.push_back(DeserializeInstruction(instr)); - } - - // Create the VM function. - VMFunction vm_func = VMFunction(loaded_func.name, - loaded_func.params, - instructions, - loaded_func.register_file_size); - auto it = vm_->global_map.find(loaded_func.name); - CHECK(it != vm_->global_map.end()); - CHECK_LE(it->second, vm_->global_map.size()); - vm_->functions[it->second] = vm_func; - } -} - -runtime::Module CreateDeserializer(const std::string& code, const runtime::Module lib) { - std::shared_ptr exec = std::make_shared(); - exec->Init(code, lib); - return runtime::Module(exec); -} - -TVM_REGISTER_GLOBAL("relay._vm._Deserializer") -.set_body_typed(CreateDeserializer); - -} // namespace vm -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/vm/deserializer.h b/src/relay/backend/vm/deserializer.h deleted file mode 100644 index 0caf72bee92c..000000000000 --- a/src/relay/backend/vm/deserializer.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file src/relay/backend/vm/deserializer.h - * \brief Define a deserializer for the serialized Relay VM. - */ - -#ifndef TVM_RELAY_BACKEND_VM_DESERIALIZER_H_ -#define TVM_RELAY_BACKEND_VM_DESERIALIZER_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace tvm { -namespace relay { -namespace vm { - -using namespace tvm::runtime::vm; -namespace runtime = tvm::runtime; - -class Deserializer : public runtime::ModuleNode { - public: - /*! - * \brief Initialize the deserializer for creating a virtual machine object. - * - * \param code The serialized code. - * \param lib The serialized runtime module/library that contains the - * hardware dependent code. - */ - inline void Init(const std::string& code, const runtime::Module& lib); - - /*! - * \brief Return the member function to the frontend. - * - * \param name The name of the function. - * \param sptr_to_self The pointer to the module node. - * - * \return The corresponding member function. - */ - PackedFunc GetFunction(const std::string& name, - const std::shared_ptr& sptr_to_self) final; - - const char* type_key() const final { return "Deserializer"; } - - /*! \brief Deserialize the serialized VM. */ - void Deserialize(); - - virtual ~Deserializer() { delete strm_; } - - private: - /*! \brief Deserialize the globals in `vm_`. */ - void DeserializeGlobalSection(); - - /*! \brief Deserialize the constant pool in `vm_`. */ - void DeserializeConstantSection(); - - /*! \brief Deserialize primitive op names in `vm_`. */ - void DeserializePrimitiveOpNames(); - - /*! \brief Deserialize the vm functions in `vm_`. */ - void DeserializeCodeSection(); - - /*! \brief The code to be serialized. */ - std::string code_; - - /*! \brief The stream used for serialization. */ - dmlc::Stream* strm_; - - /*! \brief The VM to be created. */ - std::shared_ptr vm_; -}; - -} // namespace vm -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_BACKEND_VM_DESERIALIZER_H_ diff --git a/src/relay/backend/vm/profiler/compiler.cc b/src/relay/backend/vm/profiler/compiler.cc index 9fd28e8c7f46..60c441a60cf0 100644 --- a/src/relay/backend/vm/profiler/compiler.cc +++ b/src/relay/backend/vm/profiler/compiler.cc @@ -33,7 +33,6 @@ namespace vm { class VMCompilerDebug : public VMCompiler { public: VMCompilerDebug() {} - void InitVM() override { vm_ = std::make_shared(); } virtual ~VMCompilerDebug() {} }; diff --git a/src/relay/backend/vm/serializer.cc b/src/relay/backend/vm/serializer.cc deleted file mode 100644 index 0040ef9db470..000000000000 --- a/src/relay/backend/vm/serializer.cc +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file src/relay/backend/vm/serializer.cc - * \brief Implementation of serializing APIs for the Relay VM. - */ -#include "serializer.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include "serialize_util.h" - -namespace tvm { -namespace relay { -namespace vm { - -void Serializer::Init(const VirtualMachine* vm) { - vm_ = vm; - // Initialize the stream object. - strm_ = new dmlc::MemoryStringStream(&code_); -} - -runtime::PackedFunc Serializer::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - if (name == "get_lib") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->GetLib(); - }); - } else if (name == "get_primitive_ops") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->GetPrimitiveOps(); - }); - } else if (name == "get_bytecode") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->GetBytecode(); - }); - } else if (name == "get_globals") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->GetGlobals(); - }); - } else if (name == "get_stats") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->Stats(); - }); - } else if (name == "serialize") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->Serialize(); - }); - } else { - LOG(FATAL) << "Unknown packed function: " << name; - return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); - } -} - -tvm::Array Serializer::GetPrimitiveOps() const { - std::vector ret; - for (const auto& it : vm_->primitive_map) { - auto packed_name = tvm::ir::StringImm::make(it.first); - auto packed_index = static_cast(it.second); - if (ret.size() <= packed_index) { - ret.resize(packed_index + 1); - } - ret[packed_index] = packed_name; - } - return ret; -} - -std::string Serializer::Stats() const { - std::ostringstream oss; - oss << "Relay VM statistics:" << std::endl; - - // Get the number of constants and the shape of each of them. - oss << " Constant shapes (# " << vm_->constants.size() << "): ["; - for (const auto& it : vm_->constants) { - auto* cell = it.as(); - CHECK(cell != nullptr); - runtime::NDArray data = cell->data; - const auto& shape = data.Shape(); - - // Scalar - if (shape.empty()) { - oss << "scalar, "; - continue; - } - - oss << "["; - for (auto s : shape) { - oss << s << ", "; - } - oss.seekp(-2, oss.cur); - oss << "], " << std::endl; - } - if (!vm_->constants.empty()) oss.seekp(-2, oss.cur); - oss << "]" << std::endl; - - // Get the number of globals and the name of each of them. - oss << " Globals (#" << vm_->global_map.size() << "): ["; - for (const auto& it : vm_->global_map) { - oss << "(\"" << it.first << "\", " << it.second << ")" << ", "; - } - if (!vm_->global_map.empty()) oss.seekp(-2, oss.cur); - oss << "]" << std::endl; - - // Get the number of primitive ops and the name of each of them. 
- oss << " Primitive ops (#" << vm_->primitive_map.size() << "): ["; - const auto& prim_ops = GetPrimitiveOps(); - for (const auto& it : prim_ops) { - oss << it << ", "; - } - if (!prim_ops.empty()) oss.seekp(-2, oss.cur); - oss << "]" << std::endl; - - return oss.str(); -} - -TVMByteArray Serializer::Serialize() { - uint64_t header = kTVMVMBytecodeMagic; - strm_->Write(header); - std::string version = TVM_VERSION; - strm_->Write(version); - - // Global section. - SerializeGlobalSection(); - - // Constant section. - SerializeConstantSection(); - - // Primitive names. - SerializePrimitiveOpNames(); - - // Code section. - SerializeCodeSection(); - - TVMByteArray arr; - arr.data = code_.c_str(); - arr.size = code_.length(); - return arr; -} - -void Serializer::SerializeGlobalSection() { - auto globals = GetGlobals(); - std::vector glbs; - for (const auto& it : globals) { - glbs.push_back(it.as()->value); - } - strm_->Write(glbs); -} - -void Serializer::SerializeConstantSection() { - std::vector arrays; - for (const auto& obj : vm_->constants) { - const auto* cell = obj.as(); - CHECK(cell != nullptr); - runtime::NDArray data = cell->data; - arrays.push_back(const_cast(data.operator->())); - } - strm_->Write(static_cast(vm_->constants.size())); - for (const auto& it : arrays) { - runtime::SaveDLTensor(strm_, it); - } -} - -void Serializer::SerializePrimitiveOpNames() { - auto names = GetPrimitiveOps(); - std::vector primitive_names; - for (const auto& it : names) { - primitive_names.push_back(it.as()->value); - } - strm_->Write(primitive_names); -} - -// Serialize a virtual machine instruction. It creates a list that contains the -// hash, opcode, and all fields of an instruction. -// -// For example, the function signature used to create an `AllocTensor` -// instruction is: -// Instruction AllocTensor(std::vector shape, DLDataType dtype, RegName dst) -// -// The serialized form will be: -// `hash 5 dtype.code dtype.bits dtype.lanes ndim dst_register val1 val2 ... valn` -// -// where hash is the hash of serialized instruction that is computed internally -// by the `VMInstructionSerializer`. It is used for sanity check before decoding. -// 5 shows opcode of `AllocTensor`, `(dtype.code dtype.bits dtype.lanes)` -// represents a `DLDataType`, `ndim` is the number of dimensions, `dst_register` -// is the destination register, and the rest of it together indicates the shape -// of the tensor to be allocated. -VMInstructionSerializer SerializeInstruction(const Instruction& instr) { - std::vector fields; - // Save the opcode. - DLOG(INFO) << "Serializing: " << instr << std::endl; - switch (instr.op) { - case Opcode::Move: { - // Number of fields = 2 - fields.assign({instr.from, instr.dst}); - break; - } - case Opcode::Ret: { - // Number of fields = 1 - fields.push_back(instr.result); - break; - } - case Opcode::Fatal: { - // Number of fields = 0 - break; - } - case Opcode::InvokePacked: { - // Number of fields = 3 + instr.arity - // Note that arity includes both input arguments and outputs. We will - // put all the `arity` number of fields in the end for serialization. - fields.assign({instr.packed_index, instr.arity, instr.output_size}); - // Save the args. - fields.insert(fields.end(), instr.packed_args, instr.packed_args + instr.arity); - break; - } - case Opcode::AllocTensor: { - // Number of fields = 5 + instr.alloc_tensor.ndim - // Save `DLDataType` and the dst register. 
- const auto& dtype = instr.alloc_tensor.dtype; - fields.assign({dtype.code, dtype.bits, dtype.lanes}); - - // The number of dimensions is not needed for constructing an - // `AllocTensor` instruction as it equals to the length of the `shape` - // vector. However, we save it to conveniently deserialize the instruction - // because we will know how many fields are needed by the `shape` argument. - fields.push_back(instr.alloc_tensor.ndim); - fields.push_back(instr.dst); - - // Save the shape of the tensor. - // Note that this field is rotated to the end of the list. - fields.insert(fields.end(), instr.alloc_tensor.shape, - instr.alloc_tensor.shape + instr.alloc_tensor.ndim); - break; - } - case Opcode::AllocTensorReg: { - // Number of fields = 5 - fields.push_back(instr.alloc_tensor_reg.shape_register); - // Save `DLDataType` and the dst register. - const auto& dtype = instr.alloc_tensor.dtype; - fields.assign({dtype.code, dtype.bits, dtype.lanes}); - fields.push_back(instr.dst); - break; - } - case Opcode::AllocDatatype: { - // Number of fields = 3 + instr.num_fields - fields.assign({instr.constructor_tag, instr.num_fields, instr.dst}); - - // Save the fields. - fields.insert(fields.end(), instr.datatype_fields, - instr.datatype_fields + instr.num_fields); - break; - } - case Opcode::AllocClosure: { - // Number of fields = 3 + instr.num_freevar - fields.assign({instr.clo_index, instr.num_freevar, instr.dst}); - - // Save the free vars. - fields.insert(fields.end(), instr.free_vars, - instr.free_vars + instr.num_freevar); - break; - } - case Opcode::If: { - // Number of fields = 4 - fields.assign({instr.if_op.test, - instr.if_op.target, - instr.if_op.true_offset, - instr.if_op.false_offset}); - break; - } - case Opcode::Invoke: { - // Number of fields = 3 + instr.num_args - fields.assign({instr.func_index, instr.num_args, instr.dst}); - - // Save the args. - fields.insert(fields.end(), instr.invoke_args_registers, - instr.invoke_args_registers + instr.num_args); - break; - } - case Opcode::InvokeClosure: { - // Number of fields = 3 + instr.num_closure_args - fields.assign({instr.closure, instr.num_closure_args, instr.dst}); - - // Save the args. - fields.insert(fields.end(), instr.closure_args, - instr.closure_args + instr.num_closure_args); - break; - } - case Opcode::LoadConst: { - // Number of fields = 2 - fields.assign({instr.const_index, instr.dst}); - break; - } - case Opcode::LoadConsti: { - // Number of fields = 2 - fields.assign({instr.load_consti.val, instr.dst}); - break; - } - case Opcode::GetField: { - // Number of fields = 3 - fields.assign({instr.object, instr.field_index, instr.dst}); - break; - } - case Opcode::GetTag: { - // Number of fields = 2 - fields.assign({instr.get_tag.object, instr.dst}); - break; - } - case Opcode::Goto: { - // Number of fields = 1 - fields.push_back(instr.pc_offset); - break; - } - default: - LOG(FATAL) << "Invalid opcode" << static_cast(instr.op); - break; - } - - return VMInstructionSerializer(static_cast(instr.op), fields); -} - -void Serializer::SerializeCodeSection() { - // Save the number of functions. - strm_->Write(static_cast(vm_->functions.size())); - for (const auto& func : vm_->functions) { - // Serialize the function info. - VMFunctionSerializer func_format(func.name, - func.register_file_size, - func.instructions.size(), - func.params); - func_format.Save(strm_); - - // Serialize each instruction. 
- for (const auto& instr : func.instructions) { - const auto& serialized_instr = SerializeInstruction(instr); - serialized_instr.Save(strm_); - } - } -} - -tvm::Array Serializer::GetGlobals() const { - tvm::Array ret; - std::vector > globals(vm_->global_map.begin(), - vm_->global_map.end()); - auto comp = [](const std::pair& a, - const std::pair& b) { - return a.second < b.second; - }; - std::sort(globals.begin(), globals.end(), comp); - for (const auto& it : globals) { - ret.push_back(tvm::ir::StringImm::make(it.first)); - } - return ret; -} - -std::string Serializer::GetBytecode() const { - std::ostringstream oss; - - for (const auto& func : vm_->functions) { - // Print the header of the function format. - oss << "# func name, reg file size, param count, inst count:" - << std::endl; - oss << func.name << " " - << func.register_file_size << " " - << func.params.size() << " " - << func.instructions.size() << std::endl; - - // Print pramams of a `VMFunction`. - oss << "# Parameters:"<< std::endl; - for (const auto& param : func.params) { - oss << param << " "; - } - oss << std::endl; - - // Print the instructions of a `VMFunction`. - // The part after ";" is the instruction in text format. - oss << "hash, opcode, fields # inst(text):"<< std::endl; - for (const auto& instr : func.instructions) { - const auto& serialized_instr = SerializeInstruction(instr); - oss << std::hex << "0x" << serialized_instr.Hash() << " " - << std::dec << serialized_instr.opcode << " "; - for (auto it : serialized_instr.fields) { - oss << it << " "; - } - oss << " # " << instr; - if (oss.str().back() != '\n') oss << std::endl; - } - } - - return oss.str(); -} - -runtime::Module Serializer::GetLib() const { - return vm_->lib; -} - -runtime::Module CreateSerializer(const VirtualMachine* vm) { - std::shared_ptr exec = std::make_shared(); - exec->Init(vm); - return runtime::Module(exec); -} - -TVM_REGISTER_GLOBAL("relay._vm._Serializer") -.set_body([](TVMArgs args, TVMRetValue* rv) { - runtime::Module mod = args[0]; - const auto* vm = dynamic_cast(mod.operator->()); - CHECK(vm) << "Virtual machine has not been defined yet." - << "\n"; - *rv = CreateSerializer(vm); -}); - -} // namespace vm -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/vm/serializer.h b/src/relay/backend/vm/serializer.h deleted file mode 100644 index 2371bb4c94f5..000000000000 --- a/src/relay/backend/vm/serializer.h +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file src/relay/backend/vm/serializer.h - * \brief Define a serializer for the Relay VM. 
- * - * The following components of a Relay VM will be serialized: - * - The `constants`, e.g., the constant pool, that contains the - * constants used in a Relay program. - * - The `packed_funcs` that essentially contains the generated code for - * a specific target. We return it as a runtime module that can be exported as - * a library file (e.g., .so, .o, or .tar). - * - The `global_map` that contains the globals. - * - The `primitive_map` that contains the name of individual primitive operators. - * - The `functions`, e.g., the `VMFunction`. Each `VMFunction` is composed of - * a list of instructions/bytecode. - * - * Note that only the library is returned as a separate module. All othere parts - * are stored in a single serialized code that is organized with the following - * sections in order. - * - Global section, containing all globals. - * - Constant section, storing the constant pool. - * - Primitive name section, containing the function name of the primitive ops - * used by the virtual machine. - * - Code section, handling the VM functions and bytecode. - * - * The code section is again organized as follows for each VM function: - * func_name, register_file_size, num_instructions (N) - * param1, param2, ..., paramM - * instruction1 - * instruction2 - * ... - * instructionN - * - * Serializing an `Instruction` requires us to deal with the bytecode. Each line - * of the instructions could be serialized as the following format: - * hash, opcode, f1, f2, ..., fX, field with variable length - * 1. hash: the hash of the instruction. This number will be used to help us - * validate if an instruction is well-formed during deserialization. - * 2. opcode: the opcode code of the instruction. - * 3. f1, f2, ..., fX. These fields together represent the fixed fields in - * an instruction, e.g., `from` and `dst` fields of a `Move` instruction. For - * example, `DLDataType` will be unpacked into three fields (code, bits, lanes). - * 4. The rest of the line indicates the field with variable length, e.g., - * the shape of a tensor, the args used by an `InvokPacked` instruction, etc. - */ - -#ifndef TVM_RELAY_BACKEND_VM_SERIALIZER_H_ -#define TVM_RELAY_BACKEND_VM_SERIALIZER_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace tvm { -namespace relay { -namespace vm { - -using namespace tvm::runtime; -using namespace tvm::runtime::vm; - -/*! - * \brief The Relay VM serializer. - */ -class Serializer : public runtime::ModuleNode { - public: - /*! - * \brief Initialize the serializer for a virtual machine. - * - * \param vm The Relay virtual machine. - */ - inline void Init(const VirtualMachine* vm); - - /*! - * \brief Return the member function to the frontend. - * - * \param name The name of the function. - * \param sptr_to_self The pointer to the module node. - * - * \return The corresponding member function. - */ - PackedFunc GetFunction(const std::string& name, - const std::shared_ptr& sptr_to_self) final; - - const char* type_key() const final { return "Serializer"; } - - /*! - * \brief Print the detailed statistics of the given code, i.e. number of - * globls and constants, etc. - */ - std::string Stats() const; - - /*! - * \brief Serialize the `vm_` into global section, constant section, and code - * section. - * - * \return The binary representation of the VM. - */ - TVMByteArray Serialize(); - - /*! - * \brief Get a list of the globals used by the `_vm`. - * - * \return The global map in the form a list. 
- */ - tvm::Array GetGlobals() const; - - /*! - * \brief Get the primitive operators that are contained in the Relay VM. - * - * \return The list of primitve operators. - */ - tvm::Array GetPrimitiveOps() const; - - /*! - * \brief Get the serialized form of the `functions` in `vm_`. This is - * essentially bytecode serialization. - * - * \return The serialized vm bytecode. - * - * \note The bytecode is in the following format: - * func_name reg_file_size num_instructions - * param1 param2 ... paramM - * instruction1 - * instruction2 - * ... - * instructionN - * - * Each instruction is printed in the following format: - * opcode num_fields field1 ... fieldX # The text format. - * - * The field starting from # is only used for debugging. The serialized code - * doesn't contain it, therefore the deserializer doens't need to handle it. - */ - std::string GetBytecode() const; - - /*! \brief Get the `lib` module in vm_. Serialization of `runtime::module` - * has already been supported by TVM. Therefore, we only return the runtime - * module and let users have the flexibility to call `export_library` from - * the frontend to save the library to disk. - * - * \return The runtime module that contains the hardwre dependent code. - */ - inline runtime::Module GetLib() const; - - virtual ~Serializer() { delete strm_; } - - private: - /*! \brief Serialize the globals in vm_. */ - void SerializeGlobalSection(); - - /*! \brief Serialize the constant pool in vm_. */ - void SerializeConstantSection(); - - /*! \brief Serialize primitive op names in vm_. */ - void SerializePrimitiveOpNames(); - - /*! \brief Serialize the vm functions in vm_. */ - void SerializeCodeSection(); - - /*! \brief The Relay virtual machine for to be serialized. */ - const VirtualMachine* vm_; - - /*! \brief The stream used for serialization. */ - dmlc::Stream* strm_; - - /*! \brief The serialized code. */ - std::string code_; -}; - -} // namespace vm -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_BACKEND_VM_SERIALIZER_H_ diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc new file mode 100644 index 000000000000..21f71af4eb8c --- /dev/null +++ b/src/runtime/vm/executable.cc @@ -0,0 +1,734 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/runtime/vm/executable.cc + * \brief The implementation of a virtual machine executable APIs. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "serialize_util.h" + +namespace tvm { +namespace runtime { +namespace vm { + +#define STREAM_CHECK(val, section) \ + CHECK(val) << "Invalid VM file format in the " << section << " section." 
\
+             << "\n";
+
+// Helper to serialize a vm instruction.
+VMInstructionSerializer SerializeInstruction(const Instruction& instr);
+// Helper to deserialize a serialized vm instruction.
+Instruction DeserializeInstruction(const VMInstructionSerializer& instr);
+
+PackedFunc Executable::GetFunction(const std::string& name,
+                                   const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  if (name == "get_lib") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->GetLib();
+    });
+  } else if (name == "get_bytecode") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->GetBytecode();
+    });
+  } else if (name == "get_stats") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->Stats();
+    });
+  } else if (name == "save") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->Save();
+    });
+  } else {
+    LOG(FATAL) << "Unknown packed function: " << name;
+    return PackedFunc(nullptr);
+  }
+}
+
+std::string Executable::GetBytecode() const {
+  std::ostringstream oss;
+
+  for (const auto& func : functions) {
+    // Print the header of the function format.
+    oss << "# func name, reg file size, param count, inst count:"
+        << std::endl;
+    oss << func.name << " "
+        << func.register_file_size << " "
+        << func.params.size() << " "
+        << func.instructions.size() << std::endl;
+
+    // Print the params of a `VMFunction`.
+    oss << "# Parameters: " << std::endl;
+    for (const auto& param : func.params) {
+      oss << param << " ";
+    }
+    oss << std::endl;
+
+    // Print the instructions of a `VMFunction`.
+    // The part after "#" is the instruction in text format.
+    oss << "hash, opcode, fields # inst(text):" << std::endl;
+    for (const auto& instr : func.instructions) {
+      const auto& serialized_instr = SerializeInstruction(instr);
+      oss << std::hex << "0x" << serialized_instr.Hash() << " "
+          << std::dec << serialized_instr.opcode << " ";
+      for (auto it : serialized_instr.fields) {
+        oss << it << " ";
+      }
+      oss << " # " << instr;
+      if (oss.str().back() != '\n') oss << std::endl;
+    }
+  }
+
+  return oss.str();
+}
+
+std::string Executable::Stats() const {
+  std::ostringstream oss;
+  oss << "Relay VM executable statistics:" << std::endl;
+
+  // Get the number of constants and the shape of each of them.
+  oss << "  Constant shapes (# " << constants.size() << "): [";
+  for (const auto& it : constants) {
+    const auto* cell = it.as<TensorObj>();
+    CHECK(cell);
+    runtime::NDArray data = cell->data;
+    const auto& shape = data.Shape();
+
+    // Scalar
+    if (shape.empty()) {
+      oss << "scalar, ";
+      continue;
+    }
+
+    oss << "[";
+    for (auto s : shape) {
+      oss << s << ", ";
+    }
+    oss.seekp(-2, oss.cur);
+    oss << "], " << std::endl;
+  }
+  if (!constants.empty()) oss.seekp(-2, oss.cur);
+  oss << "]" << std::endl;
+
+  // Get the number of globals and the name of each of them.
+  oss << "  Globals (#" << global_map.size() << "): [";
+  for (const auto& it : global_map) {
+    oss << "(\"" << it.first << "\", " << it.second << ")" << ", ";
+  }
+  if (!global_map.empty()) oss.seekp(-2, oss.cur);
+  oss << "]" << std::endl;
+
+  // Get the number of primitive ops and the name of each of them.
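+  // Illustratively, a module with a single fused kernel ends the report with
+  // a line like `Primitive ops (#1): [fused_add]` (made-up name); entries are
+  // ordered by their position in the packed function table.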
+ oss << " Primitive ops (#" << primitive_map.size() << "): ["; + std::vector prim_ops; + for (const auto& it : primitive_map) { + auto packed_index = static_cast(it.second); + if (prim_ops.size() <= packed_index) { + prim_ops.resize(packed_index + 1); + } + prim_ops[packed_index] = it.first; + } + for (const auto& it : prim_ops) { + oss << it << ", "; + } + if (!prim_ops.empty()) oss.seekp(-2, oss.cur); + oss << "]" << std::endl; + + return oss.str(); +} + +void SaveHeader(dmlc::Stream* strm) { + uint64_t header = kTVMVMBytecodeMagic; + strm->Write(header); + std::string version = TVM_VERSION; + strm->Write(version); +} + +TVMByteArray Executable::Save() { + // Initialize the stream object. + code_.clear(); + dmlc::MemoryStringStream strm(&code_); + + // Save header + SaveHeader(&strm); + + // Global section. + SaveGlobalSection(&strm); + + // Constant section. + SaveConstantSection(&strm); + + // Primitive names. + SavePrimitiveOpNames(&strm); + + // Code section. + SaveCodeSection(&strm); + + TVMByteArray arr; + arr.data = code_.c_str(); + arr.size = code_.length(); + return arr; +} + +void Executable::SaveGlobalSection(dmlc::Stream* strm) { + std::vector > globals(this->global_map.begin(), + this->global_map.end()); + auto comp = [](const std::pair& a, + const std::pair& b) { + return a.second < b.second; + }; + std::sort(globals.begin(), globals.end(), comp); + + std::vector glbs; + for (const auto& it : globals) { + glbs.push_back(it.first); + } + strm->Write(glbs); +} + +void Executable::SaveConstantSection(dmlc::Stream* strm) { + std::vector arrays; + for (const auto& obj : this->constants) { + const auto* cell = obj.as(); + CHECK(cell != nullptr); + runtime::NDArray data = cell->data; + arrays.push_back(const_cast(data.operator->())); + } + strm->Write(static_cast(this->constants.size())); + for (const auto& it : arrays) { + runtime::SaveDLTensor(strm, it); + } +} + +void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { + std::vector primitive_names; + for (const auto& it : this->primitive_map) { + auto packed_index = static_cast(it.second); + if (primitive_names.size() <= packed_index) { + primitive_names.resize(packed_index + 1); + } + primitive_names[packed_index] = it.first; + } + strm->Write(primitive_names); +} + +// Serialize a virtual machine instruction. It creates a list that contains the +// hash, opcode, and all fields of an instruction. +// +// For example, the function signature used to create an `AllocTensor` +// instruction is: +// Instruction AllocTensor(std::vector shape, DLDataType dtype, RegName dst) +// +// The serialized form will be: +// `hash 5 dtype.code dtype.bits dtype.lanes ndim dst_register val1 val2 ... valn` +// +// where hash is the hash of serialized instruction that is computed internally +// by the `VMInstructionExecutable`. It is used for sanity check before decoding. +// 5 shows opcode of `AllocTensor`, `(dtype.code dtype.bits dtype.lanes)` +// represents a `DLDataType`, `ndim` is the number of dimensions, `dst_register` +// is the destination register, and the rest of it together indicates the shape +// of the tensor to be allocated. +VMInstructionSerializer SerializeInstruction(const Instruction& instr) { + std::vector fields; + // Save the opcode. 
+  DLOG(INFO) << "Serializing: " << instr << std::endl;
+  switch (instr.op) {
+    case Opcode::Move: {
+      // Number of fields = 2
+      fields.assign({instr.from, instr.dst});
+      break;
+    }
+    case Opcode::Ret: {
+      // Number of fields = 1
+      fields.push_back(instr.result);
+      break;
+    }
+    case Opcode::Fatal: {
+      // Number of fields = 0
+      break;
+    }
+    case Opcode::InvokePacked: {
+      // Number of fields = 3 + instr.arity
+      // Note that arity includes both input arguments and outputs. We will
+      // put all the `arity` number of fields in the end for serialization.
+      fields.assign({instr.packed_index, instr.arity, instr.output_size});
+      // Save the args.
+      fields.insert(fields.end(), instr.packed_args, instr.packed_args + instr.arity);
+      break;
+    }
+    case Opcode::AllocTensor: {
+      // Number of fields = 5 + instr.alloc_tensor.ndim
+      // Save `DLDataType` and the dst register.
+      const auto& dtype = instr.alloc_tensor.dtype;
+      fields.assign({dtype.code, dtype.bits, dtype.lanes});
+
+      // The number of dimensions is not needed for constructing an
+      // `AllocTensor` instruction as it equals the length of the `shape`
+      // vector. However, we save it to conveniently deserialize the instruction
+      // because we will know how many fields are needed by the `shape` argument.
+      fields.push_back(instr.alloc_tensor.ndim);
+      fields.push_back(instr.dst);
+
+      // Save the shape of the tensor.
+      // Note that this field is rotated to the end of the list.
+      fields.insert(fields.end(), instr.alloc_tensor.shape,
+                    instr.alloc_tensor.shape + instr.alloc_tensor.ndim);
+      break;
+    }
+    case Opcode::AllocTensorReg: {
+      // Number of fields = 5
+      fields.push_back(instr.alloc_tensor_reg.shape_register);
+      // Save `DLDataType` and the dst register. The dtype comes from the
+      // `alloc_tensor_reg` member and is appended after the shape register
+      // already saved above; `fields.assign` would discard it.
+      const auto& dtype = instr.alloc_tensor_reg.dtype;
+      fields.push_back(dtype.code);
+      fields.push_back(dtype.bits);
+      fields.push_back(dtype.lanes);
+      fields.push_back(instr.dst);
+      break;
+    }
+    case Opcode::AllocDatatype: {
+      // Number of fields = 3 + instr.num_fields
+      fields.assign({instr.constructor_tag, instr.num_fields, instr.dst});
+
+      // Save the fields.
+      fields.insert(fields.end(), instr.datatype_fields,
+                    instr.datatype_fields + instr.num_fields);
+      break;
+    }
+    case Opcode::AllocClosure: {
+      // Number of fields = 3 + instr.num_freevar
+      fields.assign({instr.clo_index, instr.num_freevar, instr.dst});
+
+      // Save the free vars.
+      fields.insert(fields.end(), instr.free_vars,
+                    instr.free_vars + instr.num_freevar);
+      break;
+    }
+    case Opcode::If: {
+      // Number of fields = 4
+      fields.assign({instr.if_op.test,
+                     instr.if_op.target,
+                     instr.if_op.true_offset,
+                     instr.if_op.false_offset});
+      break;
+    }
+    case Opcode::Invoke: {
+      // Number of fields = 3 + instr.num_args
+      fields.assign({instr.func_index, instr.num_args, instr.dst});
+
+      // Save the args.
+      fields.insert(fields.end(), instr.invoke_args_registers,
+                    instr.invoke_args_registers + instr.num_args);
+      break;
+    }
+    case Opcode::InvokeClosure: {
+      // Number of fields = 3 + instr.num_closure_args
+      fields.assign({instr.closure, instr.num_closure_args, instr.dst});
+
+      // Save the args.
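+      // (Illustrative: invoking closure register 5 with argument registers 1
+      // and 2 into dst register 7 serializes the field list as `5 2 7 1 2`,
+      // i.e. the three fixed fields followed by the args.)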
+ fields.insert(fields.end(), instr.closure_args, + instr.closure_args + instr.num_closure_args); + break; + } + case Opcode::LoadConst: { + // Number of fields = 2 + fields.assign({instr.const_index, instr.dst}); + break; + } + case Opcode::LoadConsti: { + // Number of fields = 2 + fields.assign({instr.load_consti.val, instr.dst}); + break; + } + case Opcode::GetField: { + // Number of fields = 3 + fields.assign({instr.object, instr.field_index, instr.dst}); + break; + } + case Opcode::GetTag: { + // Number of fields = 2 + fields.assign({instr.get_tag.object, instr.dst}); + break; + } + case Opcode::Goto: { + // Number of fields = 1 + fields.push_back(instr.pc_offset); + break; + } + default: + LOG(FATAL) << "Invalid opcode" << static_cast(instr.op); + break; + } + + return VMInstructionSerializer(static_cast(instr.op), fields); +} + +void Executable::SaveCodeSection(dmlc::Stream* strm) { + // Save the number of functions. + strm->Write(static_cast(this->functions.size())); + for (const auto& func : this->functions) { + // Save the function info. + VMFunctionSerializer func_format(func.name, + func.register_file_size, + func.instructions.size(), + func.params); + func_format.Save(strm); + + // Serialize each instruction. + for (const auto& instr : func.instructions) { + const auto& serialized_instr = SerializeInstruction(instr); + serialized_instr.Save(strm); + } + } +} + +void LoadHeader(dmlc::Stream* strm) { + // Check header. + uint64_t header; + STREAM_CHECK(strm->Read(&header), "header"); + STREAM_CHECK(header == kTVMVMBytecodeMagic, "header"); + + // Check version. + std::string version; + STREAM_CHECK(strm->Read(&version), "version"); + STREAM_CHECK(version == TVM_VERSION, "version"); +} + +runtime::Module Executable::Load(const std::string& code, const runtime::Module lib) { + std::shared_ptr exec = std::make_shared(); + exec->lib = lib; + exec->code_ = code; + dmlc::MemoryStringStream strm(&exec->code_); + + // Load header. + LoadHeader(&strm); + + // Global section. + exec->LoadGlobalSection(&strm); + + // Constant section. + exec->LoadConstantSection(&strm); + + // Primitive names that will be invoked by `InvokePacked` instructions. + exec->LoadPrimitiveOpNames(&strm); + + // Code section. + exec->LoadCodeSection(&strm); + + return runtime::Module(exec); +} + +void Executable::LoadGlobalSection(dmlc::Stream* strm) { + std::vector globals; + STREAM_CHECK(strm->Read(&globals), "global"); + for (size_t i = 0; i < globals.size(); i++) { + this->global_map.insert({globals[i], i}); + } +} + +void Executable::LoadConstantSection(dmlc::Stream* strm) { + uint64_t sz; + // Load the number of constants. + STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "constant"); + + size_t size = static_cast(sz); + // Load each of the constants. + for (size_t i = 0; i < size; i++) { + runtime::NDArray constant; + STREAM_CHECK(constant.Load(strm), "constant"); + runtime::ObjectRef obj = runtime::vm::Tensor(constant); + this->constants.push_back(obj); + } +} + +void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { + std::vector primitive_names; + STREAM_CHECK(strm->Read(&primitive_names), "primitive name"); + for (size_t i = 0; i < primitive_names.size(); i++) { + this->primitive_map.insert({primitive_names[i], i}); + } +} + +// Extract the `cnt` number of fields started at `start` from the list +// `instr_fields`. 
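+// For example, with instr_fields = {5, 2, 7, 1, 2}, ExtractFields(instr_fields, 3, 2)
+// returns {1, 2}, i.e. the trailing variable-length argument registers.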
+inline std::vector ExtractFields(const std::vector& instr_fields, + Index start, + Index cnt) { + CHECK_LE(static_cast(start + cnt), instr_fields.size()); + std::vector ret; + for (auto i = start; i < start + cnt; i++) { + ret.push_back(instr_fields[i]); + } + return ret; +} + +Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { + Opcode opcode = static_cast(instr.opcode); + switch (opcode) { + case Opcode::Move: { + // Number of fields = 2 + DCHECK_EQ(instr.fields.size(), 2U); + return Instruction::Move(instr.fields[0], instr.fields[1]); + } + case Opcode::Ret: { + // Number of fields = 1 + DCHECK_EQ(instr.fields.size(), 1U); + return Instruction::Ret(instr.fields[0]); + } + case Opcode::Fatal: { + // Number of fields = 0 + DCHECK(instr.fields.empty()); + return Instruction::Fatal(); + } + case Opcode::InvokePacked: { + // Number of fields = 3 + instr.arity + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index packed_index = instr.fields[0]; + Index arity = instr.fields[1]; + Index output_size = instr.fields[2]; + std::vector args = ExtractFields(instr.fields, 3, arity); + return Instruction::InvokePacked(packed_index, arity, output_size, args); + } + case Opcode::AllocTensor: { + // Number of fields = 5 + instr.alloc_tensor.ndim + DCHECK_GE(instr.fields.size(), 5U); + DCHECK_EQ(instr.fields.size(), 5U + static_cast(instr.fields[3])); + + DLDataType dtype; + dtype.code = instr.fields[0]; + dtype.bits = instr.fields[1]; + dtype.lanes = instr.fields[2]; + + Index ndim = instr.fields[3]; + RegName dst = instr.fields[4]; + + std::vector shape = ExtractFields(instr.fields, 5, ndim); + + return Instruction::AllocTensor(shape, dtype, dst); + } + case Opcode::AllocTensorReg: { + // Number of fields = 5 + DCHECK_EQ(instr.fields.size(), 5U); + Index shape_register = instr.fields[0]; + + DLDataType dtype; + dtype.code = instr.fields[1]; + dtype.bits = instr.fields[2]; + dtype.lanes = instr.fields[3]; + + RegName dst = instr.fields[4]; + + return Instruction::AllocTensorReg(shape_register, dtype, dst); + } + case Opcode::AllocDatatype: { + // Number of fields = 3 + instr.num_fields + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index constructor_tag = instr.fields[0]; + Index num_fields = instr.fields[1]; + RegName dst = instr.fields[2]; + std::vector fields = ExtractFields(instr.fields, 3, num_fields); + + return Instruction::AllocDatatype(constructor_tag, num_fields, fields, dst); + } + case Opcode::AllocClosure: { + // Number of fields = 3 + instr.num_freevar + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index clo_index = instr.fields[0]; + Index num_freevar = instr.fields[1]; + RegName dst = instr.fields[2]; + std::vector free_vars = ExtractFields(instr.fields, 3, num_freevar); + + return Instruction::AllocClosure(clo_index, num_freevar, free_vars, dst); + } + case Opcode::If: { + // Number of fields = 4 + DCHECK_EQ(instr.fields.size(), 4U); + Index test = instr.fields[0]; + Index target = instr.fields[1]; + Index true_offset = instr.fields[2]; + Index false_offset = instr.fields[3]; + + return Instruction::If(test, target, true_offset, false_offset); + } + case Opcode::Invoke: { + // Number of fields = 3 + instr.num_args + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index func_index = instr.fields[0]; + Index num_args = 
instr.fields[1]; + RegName dst = instr.fields[2]; + std::vector args = ExtractFields(instr.fields, 3, num_args); + + return Instruction::Invoke(func_index, args, dst); + } + case Opcode::InvokeClosure: { + // Number of fields = 3 + instr.num_closure_args + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index closure = instr.fields[0]; + Index num_closure_args = instr.fields[1]; + RegName dst = instr.fields[2]; + std::vector args = ExtractFields(instr.fields, 3, num_closure_args); + + return Instruction::InvokeClosure(closure, args, dst); + } + case Opcode::LoadConst: { + // Number of fields = 2 + DCHECK_EQ(instr.fields.size(), 2U); + return Instruction::LoadConst(instr.fields[0], instr.fields[1]); + } + case Opcode::LoadConsti: { + // Number of fields = 2 + DCHECK_EQ(instr.fields.size(), 2U); + return Instruction::LoadConsti(instr.fields[0], instr.fields[1]); + } + case Opcode::GetField: { + // Number of fields = 3 + DCHECK_EQ(instr.fields.size(), 3U); + return Instruction::GetField(instr.fields[0], instr.fields[1], instr.fields[2]); + } + case Opcode::GetTag: { + // Number of fields = 2 + DCHECK_EQ(instr.fields.size(), 2U); + return Instruction::GetTag(instr.fields[0], instr.fields[1]); + } + case Opcode::Goto: { + // Number of fields = 1 + DCHECK_EQ(instr.fields.size(), 1U); + return Instruction::Goto(instr.fields[0]); + } + default: + LOG(FATAL) << "Invalid opcode" << instr.opcode; + return Instruction(); + } +} + +void Executable::LoadCodeSection(dmlc::Stream* strm) { + // Load the number of functions. + uint64_t sz; + STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "code"); + + size_t num_funcs = static_cast(sz); + this->functions.resize(num_funcs); + for (size_t i = 0; i < num_funcs; i++) { + // Load the function info. + VMFunctionSerializer loaded_func; + STREAM_CHECK(loaded_func.Load(strm), "code/function"); + + // Load the instructions. + std::vector instructions; + for (size_t j = 0; j < loaded_func.num_instructions; j++) { + VMInstructionSerializer instr; + std::vector instr_fields; + STREAM_CHECK(instr.Load(strm), "code/instruction"); + instructions.push_back(DeserializeInstruction(instr)); + } + + // Create the VM function. 
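+    // (The function is stored at the slot its name occupies in global_map,
+    // so the function indices baked into Invoke instructions stay valid.)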
+ VMFunction vm_func = VMFunction(loaded_func.name, + loaded_func.params, + instructions, + loaded_func.register_file_size); + auto it = this->global_map.find(loaded_func.name); + CHECK(it != this->global_map.end()); + CHECK_LE(it->second, this->global_map.size()); + this->functions[it->second] = vm_func; + } +} + +TVM_REGISTER_GLOBAL("relay._vm.GetNumOfGlobals") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec); + *rv = static_cast(exec->global_map.size()); +}); + +TVM_REGISTER_GLOBAL("relay._vm.GetGlobalFields") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec); + int idx = args[1]; + std::vector > globals(exec->global_map.begin(), + exec->global_map.end()); + auto comp = [](const std::pair& a, + const std::pair& b) { + return a.second < b.second; + }; + std::sort(globals.begin(), globals.end(), comp); + CHECK_LT(idx, globals.size()); + *rv = globals[idx].first; +}); + +TVM_REGISTER_GLOBAL("relay._vm.GetNumOfPrimitives") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec); + *rv = static_cast(exec->primitive_map.size()); +}); + + +TVM_REGISTER_GLOBAL("relay._vm.GetPrimitiveFields") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec); + int idx = args[1]; + CHECK_GE(idx, 0); + CHECK_LT(idx, exec->primitive_map.size()); + + for (const auto& it : exec->primitive_map) { + if (idx == static_cast(it.second)) { + *rv = it.first; + break; + } + } +}); + +TVM_REGISTER_GLOBAL("relay._vm.Load_Executable") +.set_body_typed([]( + std::string code, + runtime::Module lib) { + return Executable::Load(code, lib); +}); + +} // namespace vm +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 80e0ce57a8ae..821de0bda245 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -85,19 +85,25 @@ PackedFunc VirtualMachineDebug::GetFunction( } } -void VirtualMachineDebug::Init(const std::vector& ctxs) { - VirtualMachine::Init(ctxs); - for (auto kv : primitive_map) { +void VirtualMachineDebug::LoadExecutable(const Executable* exec) { + VirtualMachine::LoadExecutable(exec); + CHECK(this->exec); + for (auto kv : this->exec->primitive_map) { packed_index_map[kv.second] = kv.first; op_invokes[kv.second] = 0; } } +void VirtualMachineDebug::Init(const std::vector& ctxs) { + VirtualMachine::Init(ctxs); +} + void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { - auto ctx = VirtualMachine::GetParamsContext(); + CHECK(this->exec); + auto ctx = this->GetParamsContext(); // warmup VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); @@ -117,6 +123,21 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, op_invokes[packed_index] += 1; } +runtime::Module CreateVirtualMachineDebug(const Executable* exec) { + std::shared_ptr vm = std::make_shared(); + vm->LoadExecutable(exec); + return runtime::Module(vm); +} + +TVM_REGISTER_GLOBAL("relay._vm._VirtualMachineDebug") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec) << "Virtual machine 
has not been defined yet." + << "\n"; + *rv = CreateVirtualMachineDebug(exec); +}); + } // namespace vm } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h index 447967cafeb0..ff3296cb6c16 100644 --- a/src/runtime/vm/profiler/vm.h +++ b/src/runtime/vm/profiler/vm.h @@ -47,6 +47,8 @@ class VirtualMachineDebug : public VirtualMachine { void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) final; + void LoadExecutable(const Executable* exec); + ~VirtualMachineDebug() {} private: diff --git a/src/relay/backend/vm/serialize_util.h b/src/runtime/vm/serialize_util.h similarity index 95% rename from src/relay/backend/vm/serialize_util.h rename to src/runtime/vm/serialize_util.h index 3e7508ebee9b..3931f2f0e023 100644 --- a/src/relay/backend/vm/serialize_util.h +++ b/src/runtime/vm/serialize_util.h @@ -19,11 +19,11 @@ /*! * Copyright (c) 2019 by Contributors - * \file src/relay/backend/vm/serialize_util.h + * \file src/runtime/vm/serialize_util.h * \brief Definitions of helpers for serializing and deserializing a Relay VM. */ -#ifndef TVM_RELAY_BACKEND_VM_SERIALIZE_UTIL_H_ -#define TVM_RELAY_BACKEND_VM_SERIALIZE_UTIL_H_ +#ifndef TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ +#define TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ #include #include @@ -34,7 +34,7 @@ #include namespace tvm { -namespace relay { +namespace runtime { namespace vm { /*! \brief The magic number for the serialized VM bytecode file */ @@ -158,7 +158,7 @@ struct VMInstructionSerializer { }; } // namespace vm -} // namespace relay +} // namespace runtime } // namespace tvm -#endif // TVM_RELAY_BACKEND_VM_SERIALIZE_UTIL_H_ +#endif // TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 7dea9bdb95ea..78b74768b930 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -575,11 +575,12 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) { if (name == "invoke") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK(exec) << "The executable is not created yet."; std::string func_name = args[0]; - auto gvit = this->global_map.find(func_name); - CHECK(gvit != this->global_map.end()) << "Cannot find function " << func_name; + auto gvit = exec->global_map.find(func_name); + CHECK(gvit != exec->global_map.end()) << "Cannot find function " << func_name; auto func_index = gvit->second; - const auto& vm_func = this->functions[func_index]; + const auto& vm_func = exec->functions[func_index]; const auto& param_names = vm_func.params; auto ctx = this->GetParamsContext(); @@ -617,10 +618,6 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } this->Init(contexts); }); - } else if (name == "load_params") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - this->LoadParams(args[0].operator std::string()); - }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); @@ -628,43 +625,20 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } TVMContext VirtualMachine::GetParamsContext() const { + CHECK(!ctxs.empty()) << "Context has not been initialized yet." + << "\n"; + // Use the fallback device if no device index is available. 
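+  // For example, with ctxs = {cpu(0)} every parameter lands on the CPU, while
+  // with {gpu(0), cpu(0)} the first entry's device type (GPU) is preferred.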
int fallback_device_type = static_cast(ctxs[0].device_type); // TODO(wweic): For heterogeneous execution, get device information from byte const auto& cit = - std::find_if(ctxs.begin(), ctxs.end(), [&fallback_device_type](const TVMContext& c) { - return fallback_device_type == static_cast(c.device_type); - }); + std::find_if(ctxs.begin(), ctxs.end(), [&fallback_device_type](const TVMContext& c) { + return fallback_device_type == static_cast(c.device_type); + }); return (cit == ctxs.end() ? ctxs[0] : *cit); } -void VirtualMachine::LoadParams(const std::string& params) { - dmlc::MemoryStringStream mss(const_cast(¶ms)); - dmlc::Stream* strm = &mss; - uint64_t header, reserved; - CHECK(strm->Read(&header)) << "Invalid parameter file"; - CHECK(header == kTVMNDArrayListMagic) << "Invalid parameter file"; - CHECK(strm->Read(&reserved)) << "Invalid parameter file"; - - std::vector names; - CHECK(strm->Read(&names)) << "Invalid parameter file"; - - uint64_t sz; - strm->Read(&sz); - size_t size = static_cast(sz); - CHECK(size == names.size()) << "Invalid parameter file"; - - auto ctx = GetParamsContext(); - for (size_t i = 0; i < size; i++) { - NDArray arr; - CHECK(arr.Load(strm)) << "Invalid parameter file"; - ObjectRef obj = Tensor(arr); - auto copy = CopyTo(obj, ctx); - params_.emplace(std::make_pair(names[i], copy)); - } -} - void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { auto frame = VMFrame(ret_pc, func_index, arg_count, code, vm_func.register_file_size); frames.push_back(frame); @@ -699,15 +673,17 @@ ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vectorGetAllocator(ctxs[0]); DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B"; return return_register; } ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector& args) { - auto func_index = this->global_map[name]; + CHECK(exec) << "The executable has not been created yet."; + auto func_index = exec->global_map.at(name); DLOG(INFO) << "Invoke Global " << name << " at index " << func_index; - return Invoke(this->functions[func_index], args); + return Invoke(exec->functions[func_index], args); } void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, @@ -744,14 +720,16 @@ void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv); } -void VirtualMachine::Init(const std::vector& ctxs) { - this->ctxs = ctxs; +void VirtualMachine::LoadExecutable(const Executable* exec) { + CHECK(exec) << "The executable is not created yet."; + this->exec = exec; + runtime::Module lib = this->exec->lib; // Get the list of packed functions. 
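+  // Each primitive is looked up in `lib` by name and stored at the slot given
+  // by its packed_index, so InvokePacked can dispatch on a plain integer.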
- CHECK(primitive_map.empty() || lib.operator->()) + CHECK(exec->primitive_map.empty() || lib.operator->()) << "runtime module should have been built for primitive functions" << "\n"; - for (const auto& it : primitive_map) { + for (const auto& it : this->exec->primitive_map) { const auto& packed_name = it.first; auto packed_index = static_cast(it.second); if (packed_funcs.size() <= packed_index) { @@ -761,6 +739,11 @@ void VirtualMachine::Init(const std::vector& ctxs) { } } + +void VirtualMachine::Init(const std::vector& ctxs) { + this->ctxs = ctxs; +} + inline void VirtualMachine::WriteRegister(Index r, const ObjectRef& val) { frames.back().register_file[r] = val; } @@ -788,6 +771,7 @@ inline int32_t VirtualMachine::LoadScalarInt(Index r) const { void VirtualMachine::RunLoop() { CHECK(this->code); + CHECK(this->exec); this->pc = 0; Index frame_start = frames.size(); while (true) { @@ -810,7 +794,8 @@ void VirtualMachine::RunLoop() { throw std::runtime_error("VM encountered fatal error"); } case Opcode::LoadConst: { - auto constant_obj = this->constants[instr.const_index]; + auto constant_obj = exec->constants[instr.const_index]; + // TODO(wweic) ctx could be obtained from the ctxs list. auto device_obj = CopyTo(constant_obj, ctxs[0]); WriteRegister(instr.dst, device_obj); pc++; @@ -828,7 +813,7 @@ void VirtualMachine::RunLoop() { for (Index i = 0; i < instr.num_args; ++i) { args.push_back(ReadRegister(instr.invoke_args_registers[i])); } - InvokeGlobal(this->functions[instr.func_index], args); + InvokeGlobal(exec->functions[instr.func_index], args); frames.back().caller_return_register = instr.dst; goto main_loop; } @@ -858,7 +843,7 @@ void VirtualMachine::RunLoop() { for (Index i = 0; i < instr.num_closure_args; ++i) { args.push_back(ReadRegister(instr.closure_args[i])); } - InvokeGlobal(this->functions[closure->func_index], args); + InvokeGlobal(exec->functions[closure->func_index], args); frames.back().caller_return_register = instr.dst; goto main_loop; } @@ -910,6 +895,7 @@ void VirtualMachine::RunLoop() { for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { shape[i] = instr.alloc_tensor.shape[i]; } + // TODO(wweic) ctx could be obtained from the ctxs list. auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]); auto data = allocator->Empty(shape, instr.alloc_tensor.dtype, ctxs[0]); auto obj = Tensor(data); @@ -931,6 +917,7 @@ void VirtualMachine::RunLoop() { auto num_dims = shape_tensor->shape[0]; auto shape = std::vector(shape_tensor->shape[0]); shape.assign(dims, dims + num_dims); + // TODO(wweic) ctx could be obtained from the ctxs list. auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]); auto data = allocator->Empty(shape, instr.alloc_tensor_reg.dtype, ctxs[0]); auto obj = Tensor(data); @@ -976,6 +963,21 @@ void VirtualMachine::RunLoop() { } } +runtime::Module CreateVirtualMachine(const Executable* exec) { + std::shared_ptr vm = std::make_shared(); + vm->LoadExecutable(exec); + return runtime::Module(vm); +} + +TVM_REGISTER_GLOBAL("relay._vm._VirtualMachine") +.set_body([](TVMArgs args, TVMRetValue* rv) { + runtime::Module mod = args[0]; + const auto* exec = dynamic_cast(mod.operator->()); + CHECK(exec) << "The virtual machine executable has not been defined yet." 
+ << "\n"; + *rv = CreateVirtualMachine(exec); +}); + } // namespace vm } // namespace runtime } // namespace tvm diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index cedbc4f71859..1b40f894db08 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -47,14 +47,16 @@ def veval(f, *args, ctx=tvm.cpu(), target="llvm"): if isinstance(f, relay.Expr): mod = relay.Module() mod["main"] = f - vm = relay.vm.compile(mod, target) - vm.init(tvm.cpu()) + exe = relay.vm.compile(mod, target) + vm = relay.vm.VirtualMachine(exe) + vm.init(ctx) return vm.invoke("main", *args) else: assert isinstance(f, relay.Module), "expected expression or module" mod = f - vm = relay.vm.compile(mod, target) - vm.init(tvm.cpu()) + exe = relay.vm.compile(mod, target) + vm = relay.vm.VirtualMachine(exe) + vm.init(ctx) ret = vm.invoke("main", *args) return ret @@ -573,25 +575,6 @@ def test_add_op_broadcast(): mod["main"] = func check_result([x_data, y_data], x_data + y_data, mod=mod) -def test_set_params(): - mod = relay.Module() - x = relay.var('x', shape=(10, 5)) - w = relay.var('w', shape=(6, 5)) - b = relay.var('b', shape=(6,)) - y = relay.nn.bias_add(relay.nn.dense(x, w), b) - mod["main"] = relay.Function([x, w, b], y) - vm = relay.vm.compile(mod, 'llvm') - vm.init(tvm.cpu()) - - x_np = np.random.uniform(size=(10, 5)).astype('float32') - w_np = np.random.uniform(size=(6, 5)).astype('float32') - b_np = np.random.uniform(size=(6,)).astype('float32') - ref_np = np.dot(x_np, w_np.T) + b_np - params = {'w': w_np} - vm.load_params(params) - out = vm.run(x_np, b_np) - tvm.testing.assert_allclose(out.asnumpy(), ref_np) - if __name__ == "__main__": test_id() @@ -626,4 +609,3 @@ def test_set_params(): test_add_op_scalar() test_add_op_tensor() test_add_op_broadcast() - test_set_params() diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py index 3a317fc2d111..014648099aeb 100644 --- a/tests/python/relay/test_vm_serialization.py +++ b/tests/python/relay/test_vm_serialization.py @@ -22,29 +22,25 @@ from tvm import relay from tvm.relay.module import Module as rly_module from tvm.relay import vm as _vm -from tvm.relay import serializer, deserializer from tvm.relay.scope_builder import ScopeBuilder from tvm.relay.prelude import Prelude from tvm.contrib import util from tvm.relay import testing -def create_vm(f, ctx=tvm.cpu(), target="llvm", params=None): +def create_exec(f, target="llvm", params=None): if isinstance(f, relay.Expr): mod = relay.Module() mod["main"] = f - vm = _vm.compile(mod, target=target, params=params) - vm.init(ctx) - return vm + executable = _vm.compile(mod, target=target, params=params) + return executable else: assert isinstance(f, relay.Module), "expected mod as relay.Module" - vm = _vm.compile(f, target=target, params=params) - vm.init(ctx) - return vm + executable = _vm.compile(f, target=target, params=params) + return executable def veval(vm, *args, ctx=tvm.cpu()): assert isinstance(vm, _vm.VirtualMachine), "expected VirtualMachine" - vm.init(ctx) ret = vm.run(*args) return ret @@ -59,13 +55,11 @@ def get_vm_output(mod, data, params, target, ctx, dtype='float32'): return result.asnumpy().astype(dtype) def get_serialized_output(mod, data, params, target, ctx, dtype='float32'): - vm = create_vm(mod, ctx, target, params=params) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(mod, target, 
params=params) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) des_vm.init(ctx) - des_vm.load_params(params) result = des_vm.run(data) return result.asnumpy().astype(dtype) @@ -99,26 +93,25 @@ def test_serializer(): main = relay.Function([x1, y1], glb_f1(x1) * glb_f2(y1)) mod["main"] = main - vm = create_vm(mod) - ser = serializer.Serializer(vm) + exe = create_exec(mod) - glbs = ser.globals + glbs = exe.globals assert len(glbs) == 3 assert "f1" in glbs assert "f2" in glbs assert "main" in glbs - prim_ops = ser.primitive_ops + prim_ops = exe.primitive_ops assert any(item.startswith('fused_add') for item in prim_ops) assert any(item.startswith('fused_subtract') for item in prim_ops) assert any(item.startswith('fused_multiply') for item in prim_ops) - code = ser.bytecode + code = exe.bytecode assert "main 5 2 5" in code assert "f1 2 1 3" in code assert "f2 2 1 3" in code - code, lib = ser.serialize() + code, lib = exe.save() assert isinstance(code, bytearray) assert isinstance(lib, tvm.module.Module) @@ -129,24 +122,24 @@ def test_save_load(): x_data = np.random.rand(10, 10).astype('float32') # serialize. - vm = create_vm(f) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() + vm = create_exec(f) + code, lib = vm.save() assert isinstance(code, bytearray) # save and load the code and lib file. tmp = util.tempdir() path_lib = tmp.relpath("lib.so") lib.export_library(path_lib) - with open(tmp.relpath("code.bc"), "wb") as fo: + with open(tmp.relpath("code.ro"), "wb") as fo: fo.write(code) loaded_lib = tvm.module.load(path_lib) - loaded_code = bytearray(open(tmp.relpath("code.bc"), "rb").read()) + loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read()) # deserialize. - deser = deserializer.Deserializer(loaded_code, loaded_lib) - des_vm = deser.deserialize() + des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) res = veval(des_vm, x_data) tvm.testing.assert_allclose(res.asnumpy(), x_data + x_data) @@ -156,12 +149,12 @@ def test_const(): c = relay.const(1.0, "float32") x = relay.var('x', shape=(10, 10), dtype='float32') f = relay.Function([x], x + c) - vm = create_vm(f) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() + exe = create_exec(f) + code, lib = exe.save() assert isinstance(code, bytearray) - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) x_data = np.random.rand(10, 10).astype('float32') res = veval(des_vm, x_data) tvm.testing.assert_allclose(res.asnumpy(), x_data + 1) @@ -177,11 +170,11 @@ def test_if(): x_data = np.random.rand(10, 10).astype('float32') y_data = np.random.rand(10, 10).astype('float32') - vm = create_vm(f) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(f) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) # same res = veval(des_vm, x_data, x_data) @@ -213,11 +206,11 @@ def test_loop(): aarg = relay.var('accum', shape=[], dtype='int32') mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg)) - vm = create_vm(mod) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() 
+ exe = create_exec(mod) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) result = veval(des_vm, i_data, accum_data) tvm.testing.assert_allclose(result.asnumpy(), sum(range(1, loop_bound + 1))) @@ -230,11 +223,11 @@ def test_tuple(): i_data = np.random.rand(41).astype('float32') j_data = np.random.rand(10).astype('float32') - vm = create_vm(f) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(f) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) result = veval(des_vm, (i_data, j_data)) tvm.testing.assert_allclose(result.asnumpy(), j_data) @@ -251,11 +244,11 @@ def test_adt_list(): f = relay.Function([], l321) mod["main"] = f - vm = create_vm(mod) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(mod) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) result = veval(des_vm) assert len(result) == 2 @@ -297,11 +290,11 @@ def test_adt_compose(): f = relay.Function([y], add_two_body) mod["main"] = f - vm = create_vm(mod) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(mod) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) x_data = np.array(np.random.rand()).astype('float32') result = veval(des_vm, x_data) @@ -317,11 +310,11 @@ def test_closure(): clo = ff(relay.const(1.0)) main = clo(relay.const(2.0)) - vm = create_vm(main) - ser = serializer.Serializer(vm) - code, lib = ser.serialize() - deser = deserializer.Deserializer(code, lib) - des_vm = deser.deserialize() + exe = create_exec(main) + code, lib = exe.save() + des_exec = _vm.Executable.load_exec(code, lib) + des_vm = _vm.VirtualMachine(des_exec) + des_vm.init(tvm.cpu()) res = veval(des_vm) tvm.testing.assert_allclose(res.asnumpy(), 3.0) diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py index b5ce0ec70e51..53f573730576 100644 --- a/tests/python/unittest/test_runtime_vm_profiler.py +++ b/tests/python/unittest/test_runtime_vm_profiler.py @@ -26,9 +26,9 @@ def test_basic(): mod, params = resnet.get_workload() target = 'llvm' ctx = tvm.cpu() - vm = relay.profiler_vm.compile(mod, target) + exe = relay.profiler_vm.compile(mod, target, params=params) + vm = relay.profiler_vm.VirtualMachineProfiler(exe) vm.init(ctx) - vm.load_params(params) data = np.random.rand(1, 3, 224, 224).astype('float32') res = vm.invoke("main", [data]) From 5faa6f70d7d70e56cb3f44ff3f4e5699be287a3e Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Thu, 17 Oct 2019 22:41:34 -0700 Subject: [PATCH 02/59] [Relay][Frontend][TF] Add tensor array ops (#3798) * [Relay][Frontend][TF] Add tensor array ops * rename * delete test * Move utility function * Refactor * fix tensor array ops * fix test * fix rebase * Fix serializer bug * Improve tf convert name lookup to use prelude api * Fix lint * Fix test --- python/tvm/relay/frontend/tensorflow.py | 82 ++- python/tvm/relay/op/_tensor.py | 26 + python/tvm/relay/prelude.py | 520 ++++++++++++++++++ 
python/tvm/relay/testing/py_converter.py | 8 +- src/runtime/vm/executable.cc | 4 +- .../frontend/tensorflow/test_forward.py | 118 +++- tests/python/relay/test_adt.py | 148 +++++ tests/python/relay/test_feature.py | 3 +- 8 files changed, 899 insertions(+), 10 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 38f9c523e0b1..eb67cf24b81e 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -22,10 +22,14 @@ import warnings from collections import defaultdict + # Numpy support import numpy as np import tvm + +from tvm.relay.prelude import Prelude + from .. import analysis from .. import expr as _expr from .. import op as _op @@ -508,6 +512,69 @@ def _impl(inputs, attr, params): return _op.concatenate(inputs_reshaped, axis) return _impl +def _tensor_array(): + def _impl(inputs, attr, params, prelude): + dtype_str = attr.get('dtype').name + tensor_array_constructor = prelude.get_var('tensor_array', dtype_str) + return tensor_array_constructor(_op.take(inputs[0], tvm.relay.const(0))) + return _impl + +def _tensor_array_scatter(): + def _impl(inputs, attr, params, prelude): + dtype_str = attr.get('T').name + values_rank = len(inputs[2].type_annotation.shape) + unstack_name = "tensor_array_unstack_tensor{}".format(values_rank) + unstack_function = prelude.get_var(unstack_name, dtype_str) + values = unstack_function(inputs[2]) + tensor_array_scatter_func = prelude.get_var('tensor_array_scatter', dtype_str) + return tensor_array_scatter_func(inputs[0], inputs[1], values) + return _impl + +def _tensor_array_gather(): + def _impl(inputs, attr, params, prelude): + return prelude.tensor_array_gather(inputs[2], inputs[1]) + return _impl + +def _tensor_array_size(): + def _impl(inputs, attr, params, prelude): + return prelude.length(inputs[0]) + return _impl + +def _tensor_array_write(): + def _impl(inputs, attr, params, prelude): + input_rank = len(inputs[2].type_annotation.shape) + dtype = attr.get('T').name + + tensor_name = 'tensor{}'.format(input_rank) + tensor_func = prelude.get_var(tensor_name, dtype) + v = tensor_func(inputs[2]) + write_func = prelude.get_var('tensor_array_write', dtype) + + return write_func(inputs[3], _op.take(inputs[1], tvm.relay.const(0)), v) + return _impl + +def _tensor_array_read(): + def _impl(inputs, attr, params, prelude): + read_func = prelude.get_var('tensor_array_read', attr.get('dtype').name) + return read_func(inputs[2], _op.take(inputs[1], tvm.relay.const(0))) + return _impl + +def _tensor_array_split(): + def _impl(inputs, attr, params, prelude): + input_rank = len(inputs[1].type_annotation.shape) + dtype_str = attr.get('T').name + v = prelude.get_var("tensor{}".format(input_rank), dtype_str)(inputs[1]) + lengths = _op.cast(inputs[2], 'int32') + split_var = prelude.get_var('tensor_array_split', dtype_str) + return split_var(inputs[0], v, lengths) + return _impl + +def _tensor_array_concat(): + def _impl(inputs, attr, params, prelude): + concat_func = prelude.get_var('tensor_array_concat', attr['dtype'].name) + return concat_func(inputs[1]) + return _impl + def _tile(): def _impl(inputs, attr, params): reps = _get_list_param(params, inputs.pop()) @@ -1313,6 +1380,14 @@ def _impl(inputs, attr, params): 'NotEqual' : _broadcast('not_equal'), 'OneHot' : _one_hot(), 'Pack' : _pack(), + 'TensorArrayV3' : _tensor_array(), + 'TensorArrayScatterV3' : _tensor_array_scatter(), + 'TensorArrayGatherV3' : _tensor_array_gather(), + 'TensorArraySizeV3' : 
_tensor_array_size(), + 'TensorArrayWriteV3' : _tensor_array_write(), + 'TensorArrayReadV3' : _tensor_array_read(), + 'TensorArraySplitV3' : _tensor_array_split(), + 'TensorArrayConcatV3' : _tensor_array_concat(), 'Pad' : _pad('Pad'), 'PadV2' : _pad('PadV2'), 'Pow' : _elemwise('power'), @@ -1860,6 +1935,7 @@ def __init__(self): self._loops = {} self._branches = {} self._mod = _module.Module({}) + self._prelude = Prelude(self._mod) def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): """Construct relay nodes from tensorflow graph definition - GraphDef. @@ -2335,7 +2411,11 @@ def _convert_operator(self, op_name, inputs, attrs, if op_name in identity_list: sym = get_relay_op(op_name)(*inputs, **attrs) elif op_name in convert_map: - sym = convert_map[op_name](inputs, attrs, self._params) + if 'TensorArray' in op_name: + sym = convert_map[op_name](inputs, attrs, self._params, self._prelude) + else: + sym = convert_map[op_name](inputs, attrs, self._params) + elif op_name in convert_map_rnn: sym = self._convert_rnn_operator(op_name, inputs, attrs, self._params, graph, diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index da5804906269..188b3bb15956 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -108,6 +108,29 @@ def clip_compute(attrs, inputs, output_type, target): register_schedule("clip", schedule_elemwise) +@script +def _cast_shape_function(x): + out_ndim = len(x) + out = output_tensor((out_ndim,), "int64") + for i in const_range(out_ndim): + out[i] = x[i] + return out + +def cast_shape_func(attrs, inputs, out_ndims): + return [_cast_shape_function(*inputs)] + +@script +def _expand_dims_shape_func(x): + ndim = len(x.shape) + out = output_tensor((ndim+1,), "int64") + out[0] = int64(1) + for i in const_range(0, ndim): + out[i+1] = int64(x.shape[i]) + return out + +def expand_dims_shape_func(attrs, inputs, out_ndims): + return [_expand_dims_shape_func(*inputs)] + # shape func @script def _broadcast_shape_func(x, y, ndim): @@ -140,6 +163,9 @@ def _broadcast_shape_func(x, y, ndim): def broadcast_shape_func(attrs, inputs, out_ndims): return [_broadcast_shape_func(*inputs, out_ndims[0])] +register_shape_func("expand_dims", False, expand_dims_shape_func) +register_shape_func("cast", False, cast_shape_func) + register_shape_func("add", False, broadcast_shape_func) register_shape_func("subtract", False, broadcast_shape_func) register_shape_func("multiply", False, broadcast_shape_func) diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 803d8ef50db5..d27ffe512617 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -16,8 +16,513 @@ # under the License. # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name """A prelude containing useful global functions and ADT definitions.""" +from .ty import GlobalTypeVar, TensorType, Any, scalar_type +from .expr import Var, Function, GlobalVar, If, const +from .op.tensor import add, subtract, equal +from .adt import Constructor, TypeData, Clause, Match +from .adt import PatternConstructor, PatternVar, PatternWildcard +from . 
import op from .module import Module +class TensorArrayOps(object): + """Contains tensor array related ops""" + + def __init__(self, prelude, dtype): + """Create tensor array ops registry""" + self.prelude = prelude + self.dtype = dtype + + def get_name(self, canonical): + """Get name corresponding to the caninical name""" + return self.prelude.get_name(canonical, self.dtype) + + def get_var(self, canonical): + """Get var corresponding to the caninical name""" + return self.prelude.get_var(canonical, self.dtype) + + def define_tensor_adt(self): + """Defines the dynamic tensor ADT, which is the container for tensors + with variable shapes.""" + tensor_type_name = self.get_name('tensor_t') + tensor_type_var = GlobalTypeVar(tensor_type_name) + setattr(self.prelude, tensor_type_name, tensor_type_var) + tensor0_type = TensorType([], self.dtype) + tensor1_type = TensorType([Any()], self.dtype) + tensor2_type = TensorType([Any(), Any()], self.dtype) + tensor3_type = TensorType([Any(), Any(), Any()], self.dtype) + tensor4_type = TensorType([Any(), Any(), Any(), Any()], self.dtype) + tensor5_type = TensorType([Any(), Any(), Any(), Any(), Any()], self.dtype) + tensor6_type = TensorType([Any(), Any(), Any(), Any(), Any(), Any()], self.dtype) + tensor_nil_name = self.get_name('tensor_nil') + tensor0_name = self.get_name('tensor0') + tensor1_name = self.get_name('tensor1') + tensor2_name = self.get_name('tensor2') + tensor3_name = self.get_name('tensor3') + tensor4_name = self.get_name('tensor4') + tensor5_name = self.get_name('tensor5') + tensor6_name = self.get_name('tensor6') + tensor_nil_case = Constructor(tensor_nil_name, [], tensor_type_var) + tensor0_case = Constructor(tensor0_name, [tensor0_type], tensor_type_var) + tensor1_case = Constructor(tensor1_name, [tensor1_type], tensor_type_var) + tensor2_case = Constructor(tensor2_name, [tensor2_type], tensor_type_var) + tensor3_case = Constructor(tensor3_name, [tensor3_type], tensor_type_var) + tensor4_case = Constructor(tensor4_name, [tensor4_type], tensor_type_var) + tensor5_case = Constructor(tensor5_name, [tensor5_type], tensor_type_var) + tensor6_case = Constructor(tensor6_name, [tensor6_type], tensor_type_var) + setattr(self.prelude, tensor_nil_name, tensor_nil_case) + setattr(self.prelude, tensor0_name, tensor0_case) + setattr(self.prelude, tensor1_name, tensor1_case) + setattr(self.prelude, tensor2_name, tensor2_case) + setattr(self.prelude, tensor3_name, tensor3_case) + setattr(self.prelude, tensor4_name, tensor4_case) + setattr(self.prelude, tensor5_name, tensor5_case) + setattr(self.prelude, tensor6_name, tensor6_case) + self.prelude.mod[tensor_type_var] = TypeData(tensor_type_var, [], [tensor_nil_case, + tensor0_case, + tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case, + tensor5_case, + tensor6_case]) + + def define_tensor_take(self): + """Defines a function to return a range of tensor_t on axis 0. 
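+        Semantically this is `op.take` with an `arange(lower, upper)` index on
+        axis 0, lifted over every tensor constructor of rank 1 through 6.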
+ tensor_take(t, lower, upper) : + tensor_t -> Tensor[(), int32] -> Tensor[(), int32] -> tensor_t + """ + take_name = self.get_name("tensor_take") + take_var = GlobalVar(take_name) + setattr(self.prelude, take_name, take_var) + tensor_t = self.get_var('tensor_t') + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + tensor5_var = self.get_var('tensor5') + tensor6_var = self.get_var('tensor6') + t = Var('tensor', tensor_t()) + lower = Var('lower', scalar_type('int32')) + upper = Var('upper', scalar_type('int32')) + t1 = Var('t1') + t2 = Var('t2') + t3 = Var('t3') + t4 = Var('t4') + t5 = Var('t5') + t6 = Var('t6') + tensor1_case =\ + Clause(PatternConstructor(tensor1_var, [PatternVar(t1)]), + tensor1_var(op.take(t1, op.arange(lower, upper, dtype='int32')))) + tensor2_case =\ + Clause(PatternConstructor(tensor2_var, [PatternVar(t2)]), + tensor2_var(op.take(t2, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor3_case =\ + Clause(PatternConstructor(tensor3_var, [PatternVar(t3)]), + tensor3_var(op.take(t3, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor4_case =\ + Clause(PatternConstructor(tensor4_var, [PatternVar(t4)]), + tensor4_var(op.take(t4, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor5_case =\ + Clause(PatternConstructor(tensor5_var, [PatternVar(t5)]), + tensor5_var(op.take(t5, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor6_case =\ + Clause(PatternConstructor(tensor6_var, [PatternVar(t6)]), + tensor6_var(op.take(t6, op.arange(lower, upper, dtype='int32'), axis=0))) + self.prelude.mod[take_var] =\ + Function([t, lower, upper], + Match(t, [tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case, + tensor5_case, + tensor6_case], False), + tensor_t(), []) + + def define_tensor_expand_dims(self): + """Defines a function to grow a tensor_t's rank by adding one dimension in front + of the original tensor_t. 
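+
+        A rank-1 value of shape (n,) becomes a rank-2 value of shape (1, n),
+        mirroring numpy.expand_dims(x, axis=0). Usage sketch (illustrative
+        only; it matches the tests added later in this patch):
+
+            expand_dims = p.get_var('tensor_expand_dims', 'float32')
+            tensor1 = p.get_var('tensor1', 'float32')
+            y = expand_dims(tensor1(x))  # rank grows from 1 to 2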
+ tensor_expand_dims(t) : tensor_t -> tensor_t + """ + expand_dims_name = self.get_name("tensor_expand_dims") + expand_dims_var = GlobalVar(expand_dims_name) + setattr(self.prelude, expand_dims_name, expand_dims_var) + tensor_type_var = self.get_var('tensor_t') + x = Var("x", tensor_type_var()) + t0 = Var("t0") + t1 = Var("t1") + t2 = Var("t2") + t3 = Var("t3") + t4 = Var("t4") + t5 = Var("t5") + tensor0_var = self.get_var('tensor0') + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + tensor5_var = self.get_var('tensor5') + tensor6_var = self.get_var('tensor6') + tensor0_case = Clause(PatternConstructor(tensor0_var, [PatternVar(t0)]), + tensor1_var(op.expand_dims(t0, 0, 1))) + tensor1_case = Clause(PatternConstructor(tensor1_var, [PatternVar(t1)]), + tensor2_var(op.expand_dims(t1, 0, 1))) + tensor2_case = Clause(PatternConstructor(tensor2_var, [PatternVar(t2)]), + tensor3_var(op.expand_dims(t2, 0, 1))) + tensor3_case = Clause(PatternConstructor(tensor3_var, [PatternVar(t3)]), + tensor4_var(op.expand_dims(t3, 0, 1))) + tensor4_case = Clause(PatternConstructor(tensor4_var, [PatternVar(t4)]), + tensor5_var(op.expand_dims(t4, 0, 1))) + tensor5_case = Clause(PatternConstructor(tensor5_var, [PatternVar(t5)]), + tensor6_var(op.expand_dims(t5, 0, 1))) + self.prelude.mod[expand_dims_var] =\ + Function([x], + Match(x, [tensor0_case, + tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case, + tensor5_case], False)) + + def define_tensor_concat(self): + """Defines a function to concatenate two tensor_t on the first axis + + tensor_concatenate(t) : tensor_t -> tensor_t -> tensor_t + """ + concat_name = self.get_name("tensor_concatenate") + concat_var = GlobalVar(concat_name) + setattr(self.prelude, concat_name, concat_var) + tensor_type_var = self.get_var('tensor_t') + x = Var("x", tensor_type_var()) + y = Var("y", tensor_type_var()) + + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + t11 = Var("t11") + t12 = Var("t12") + t21 = Var("t21") + t22 = Var("t22") + t31 = Var("t31") + t32 = Var("t32") + t41 = Var("t41") + t42 = Var("t42") + tensor1_case = Clause(PatternConstructor(tensor1_var, [PatternVar(t11)]), + Match(y, [Clause(PatternConstructor(tensor1_var, [PatternVar(t12)]), + tensor1_var(op.concatenate([t11, t12], axis=0)))], + False)) + tensor2_case = Clause(PatternConstructor(tensor2_var, [PatternVar(t21)]), + Match(y, [Clause(PatternConstructor(tensor2_var, [PatternVar(t22)]), + tensor2_var(op.concatenate([t21, t22], axis=0)))], + False)) + tensor3_case = Clause(PatternConstructor(tensor3_var, [PatternVar(t31)]), + Match(y, [Clause(PatternConstructor(tensor3_var, [PatternVar(t32)]), + tensor3_var(op.concatenate([t31, t32], axis=0)))], + False)) + tensor4_case = Clause(PatternConstructor(tensor4_var, [PatternVar(t41)]), + Match(y, [Clause(PatternConstructor(tensor4_var, [PatternVar(t42)]), + tensor4_var(op.concatenate([t41, t42], axis=0)))], + False)) + # op.concatenate does not support tensor with rank higher than 4 + self.prelude.mod[concat_var] =\ + Function([x, y], Match(x, [tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case], False)) + + def define_tensor_array(self): + """Defines a function to create a tensor array with size n. 
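+
+        The result is a Prelude list of n tensor_nil placeholders; an
+        equivalent Python sketch of the recursion (illustrative only):
+
+            def tensor_array(n):
+                return nil() if n == 0 else cons(tensor_nil(), tensor_array(n - 1))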
+ tensor_array(n) : Tensor[(), int32] -> list[tensor_t] + """ + tensor_array_constructor_name = self.get_name("tensor_array") + tensor_array_constructor_var = GlobalVar(tensor_array_constructor_name) + setattr(self.prelude, tensor_array_constructor_name, tensor_array_constructor_var) + tensor_nil_var = self.get_var('tensor_nil') + tensor_type_var = self.get_var('tensor_t') + n = Var("x", scalar_type('int32')) + body = If(equal(n, const(0)), + self.prelude.nil(), + self.prelude.cons(tensor_nil_var(), + tensor_array_constructor_var(subtract(n, const(1))))) + self.prelude.mod[tensor_array_constructor_var] = \ + Function([n], body, self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_read(self): + """Defines a function to get the head of a list. Assume the list has at least one + element. + + tensor_array_read(ta, n) : list[tensor_t] -> Tensor[(), int32] -> tensor_t + """ + read_name = self.get_name("tensor_array_read") + read_var = GlobalVar(read_name) + setattr(self.prelude, read_name, read_var) + tensor_type_var = self.get_var('tensor_t') + + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + n = Var("x", scalar_type('int32')) + self.prelude.mod[read_var] =\ + Function([tensor_array, n], self.prelude.nth(tensor_array, n), tensor_type_var(), []) + + def define_tensor_array_write(self): + """Defines a function to update a tensor array at index n with value v. + tensor_array_write(ta, n, v) : + list[tensor_t] -> Tensor[(), int32] -> tensor_t -> list[tensor_t] + """ + write_name = self.get_name("tensor_array_write") + write_var = GlobalVar(write_name) + setattr(self.prelude, write_name, write_var) + tensor_type_var = self.get_var('tensor_t') + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + n = Var("x", scalar_type('int32')) + v = Var("v", tensor_type_var()) + self.prelude.mod[write_var] =\ + Function([tensor_array, n, v], self.prelude.update(tensor_array, n, v), + self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_unstack_tensor1(self): + """Defines a function to unstack the values of a tensor_t with rank 1 in a tensor array. + tensor_array_unstack_tensor1(t) : tensor_t -> list[tensor_t] + """ + helper_name = self.get_name("tensor_array_unstack_tensor1_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor = Var("t", TensorType([Any()], self.dtype)) + up = Var("up", scalar_type('int32')) + i = Var("i", scalar_type('int32')) + tensor_type_var = self.get_var('tensor_t') + tensor0_var = self.get_var('tensor0') + helper_body =\ + If(equal(i, up), + self.prelude.nil(), + self.prelude.cons(tensor0_var(op.take(tensor, i)), + helper_var(add(i, const(1)), up, tensor))) + self.prelude.mod[helper_var] =\ + Function([i, up, tensor], helper_body, self.prelude.l(tensor_type_var()), []) + unstack_name = self.get_name("tensor_array_unstack_tensor1") + unstack_var = GlobalVar(unstack_name) + setattr(self.prelude, unstack_name, unstack_var) + tensor1 = Var("tensor", TensorType([Any()], self.dtype)) + shape = op.shape_of(tensor1) + ndim = op.take(shape, const(0)) + self.prelude.mod[unstack_var] =\ + Function([tensor1], helper_var(const(0), ndim, tensor1), + self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_unstack_tensor2(self): + """Defines a function to unstack the values of a tensor_t with rank 2 in a tensor array. 
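+        Each row of the rank-2 input becomes one rank-1 element of the list,
+        so a (3, 4) input yields a list of three (4,) tensors. NumPy analogy
+        (illustrative only):
+
+            unstacked = [t[i] for i in range(t.shape[0])]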
+ + tensor_array_unstack_tensor2(t) : tensor_t -> list[tensor_t] + """ + helper_name = self.get_name("tensor_array_unstack_tensor2_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor = Var("t", TensorType([Any(), Any()], self.dtype)) + up = Var("up", scalar_type('int32')) + i = Var("i", scalar_type('int32')) + + helper_body = If(equal(i, up), + self.prelude.nil(), + self.prelude.cons(self.get_var('tensor1')(op.take(tensor, i, axis=0)), + helper_var(add(i, const(1)), up, tensor))) + self.prelude.mod[helper_var] =\ + Function([i, up, tensor], helper_body, self.prelude.l(self.get_var('tensor_t')()), []) + + tensor_array_unstack_tensor2_name = self.get_name("tensor_array_unstack_tensor2") + tensor_array_unstack_tensor2_var = GlobalVar(tensor_array_unstack_tensor2_name) + setattr(self.prelude, tensor_array_unstack_tensor2_name, tensor_array_unstack_tensor2_var) + tensor2 = Var("tensor", TensorType([Any(), Any()], self.dtype)) + shape = op.shape_of(tensor2) + ndim = op.take(shape, const(0)) + self.prelude.mod[tensor_array_unstack_tensor2_var] =\ + Function([tensor2], helper_var(const(0), ndim, tensor2), + self.prelude.l(self.get_var('tensor_t')()), []) + + def define_tensor_array_scatter(self): + """Defines a function to scatter the values of a tensor_t in indices of a tensor array. + tensor_array_scatter(ta, indices, value) : + list[tensor_t] -> Tensor[(Any), int32] -> tensor_t -> list[tensor_t] + """ + tensor_array_scatter_helper_name = self.get_name("tensor_array_scatter_helper") + tensor_array_scatter_helper_var = GlobalVar(tensor_array_scatter_helper_name) + tensor_t = self.get_var('tensor_t') + ta = Var("ta", self.prelude.l(tensor_t())) + current = Var("current", scalar_type('int32')) + limit = Var("limit", scalar_type('int32')) + indices_ = Var('indices_', TensorType([Any()], 'int32')) + values_ = Var('values_', self.prelude.l(tensor_t())) + write_var = self.get_var('tensor_array_write') + read_var = self.get_var('tensor_array_read') + helper_body = If(equal(current, limit), + ta, + tensor_array_scatter_helper_var( + write_var(ta, op.take(indices_, current), + read_var(values_, current)), + add(current, const(1)), + limit, indices_, values_)) + self.prelude.mod[tensor_array_scatter_helper_var] =\ + Function([ta, current, limit, indices_, values_], + helper_body, self.prelude.l(tensor_t()), []) + tensor_array_scatter_name = self.get_name("tensor_array_scatter") + tensor_array_scatter_var = GlobalVar(tensor_array_scatter_name) + setattr(self.prelude, tensor_array_scatter_name, tensor_array_scatter_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + indices = Var('indices', TensorType([Any()], 'int32')) + values = Var('values', self.prelude.l(tensor_t())) + indices_shape = op.shape_of(indices) + limit = op.take(indices_shape, const(0)) + body = tensor_array_scatter_helper_var(tensor_array, const(0), limit, indices, values) + self.prelude.mod[tensor_array_scatter_var] =\ + Function([tensor_array, indices, values], body, self.prelude.l(tensor_t()), []) + + def define_tensor_array_split(self): + """Defines a function to split the values of a tensor_t into a tensor array. 
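+
+        Semantically close to numpy.split with explicit section lengths; a
+        hedged sketch of the behavior (illustrative only):
+
+            # lengths [2, 2, 2, 2] cuts an (8, 1) value into four (2, 1)
+            # chunks, written at indices 0..3 of the tensor array
+            offsets = numpy.cumsum([0] + list(lengths[:-1]))
+            chunks = [value[o:o + n] for o, n in zip(offsets, lengths)]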
+ tensor_array_split(ta, value, lengths) : + list[tensor_t] -> tensor_t -> Tensor[(Any), int32] -> list[tensor_t] + """ + tensor_t = self.get_var('tensor_t') + tensor_array_split_helper_name = self.get_name("ta_split_helper") + tensor_array_split_helper_var = GlobalVar(tensor_array_split_helper_name) + setattr(self.prelude, tensor_array_split_helper_name, tensor_array_split_helper_var) + ta1 = Var("tensor_array", self.prelude.l(tensor_t())) + value1 = Var('value1', tensor_t()) + offset1 = Var('offset1', scalar_type('int32')) + current1 = Var('current1', scalar_type('int32')) + limit1 = Var('limit1', scalar_type('int32')) + lengths1 = Var('lengths', TensorType([Any()], 'int32')) + write_var = self.get_var('tensor_array_write') + take_var = self.get_var('tensor_take') + helper1_body = If(equal(current1, limit1), + ta1, + write_var( + tensor_array_split_helper_var( + ta1, + value1, + add(offset1, op.take(lengths1, current1)), + add(current1, const(1)), + limit1, + lengths1 + ), + current1, + take_var(value1, + offset1, + add(op.take(lengths1, current1), offset1)))) + self.prelude.mod[tensor_array_split_helper_var] = \ + Function([ta1, value1, offset1, current1, limit1, lengths1], + helper1_body, self.prelude.l(tensor_t()), []) + split_name = self.get_name("tensor_array_split") + split_var = GlobalVar(split_name) + setattr(self.prelude, split_name, split_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + value = Var('value', tensor_t()) + lengths = Var('lengths', TensorType([Any()], 'int32')) + lengths_shape = op.shape_of(lengths) + lengths_limit = op.take(lengths_shape, const(0)) + body = tensor_array_split_helper_var( + tensor_array, + value, + const(0), + const(0), + lengths_limit, + lengths) + self.prelude.mod[split_var] =\ + Function([tensor_array, value, lengths], body, self.prelude.l(tensor_t()), []) + + def define_tensor_array_concat(self): + """Defines a function to return the values in the tensor array as concatenated tensor_t. + tensor_array_concat(ta) : list[tensor_t] -> tensor_t + """ + concat_name = self.get_name("tensor_array_concat") + concat_var = GlobalVar(concat_name) + setattr(self.prelude, concat_name, concat_var) + tensor_concat_var = self.get_var('tensor_concatenate') + tensor_t = self.get_var('tensor_t') + tensor_nil_var = self.get_var('tensor_nil') + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + hd = Var("hd") + tl = Var("tl") + nil_case = Clause(PatternConstructor(self.prelude.nil), tensor_nil_var()) + cons_case = Clause(PatternConstructor(self.prelude.cons, [PatternVar(hd), PatternVar(tl)]), + Match(tl, [ + Clause(PatternConstructor(self.prelude.nil), hd), + Clause(PatternWildcard(), + tensor_concat_var(hd, concat_var(tl))) + ], False)) + self.prelude.mod[concat_var] =\ + Function([tensor_array], + Match(tensor_array, [nil_case, cons_case], False), tensor_t(), []) + + def define_tensor_array_gather(self): + """Defines a function to return the selected values in a tensor array as tensor_t. 
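+
+        It reads each selected element and stacks the results into a single
+        tensor_t, roughly (NumPy sketch, illustrative only):
+
+            gathered = numpy.stack([tensor_array[i] for i in indices])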
+ tensor_array_gather(ta, indices) : list[tensor_t] -> Tensor[(Any), int32] -> tensor_t + """ + helper_name = self.get_name("tensor_array_gather_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor_type_var = self.get_var('tensor_t') + stack_var = self.get_var('tensor_array_stack') + read_var = self.get_var('tensor_array_read') + ta = Var("ta", self.prelude.l(tensor_type_var())) + accu = Var("accu", self.prelude.l(tensor_type_var())) + current = Var("current", scalar_type('int32')) + limit = Var("limit", scalar_type('int32')) + indices_ = Var('indices_', TensorType([Any()], 'int32')) + helper_body =\ + If(equal(current, const(0)), + stack_var(accu), + helper_var( + ta, + self.prelude.cons( + read_var( + ta, op.take(indices_, subtract(current, const(1)))), accu), + subtract(current, const(1)), + limit, indices_)) + self.prelude.mod[helper_var] = \ + Function([ta, accu, current, limit, indices_], helper_body, tensor_type_var(), []) + gather_name = self.get_name("tensor_array_gather") + gather_var = GlobalVar(gather_name) + setattr(self.prelude, gather_name, gather_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + indices = Var('indices', TensorType([Any()], 'int32')) + indices_shape = op.shape_of(indices) + limit = op.take(indices_shape, const(0)) + body = helper_var(tensor_array, self.prelude.nil(), limit, limit, indices) + self.prelude.mod[gather_var] =\ + Function([tensor_array, indices], body, tensor_type_var(), []) + + def define_tensor_array_stack(self): + """Defines a function to get the values in the tensor array as a stack tensor_t. + tensor_array_stack(l) : list[tensor_t] -> tensor_t + """ + stack_name = self.get_name("tensor_array_stack") + stack_var = GlobalVar(stack_name) + setattr(self.prelude, stack_name, stack_var) + tensor_type_var = self.get_var('tensor_t') + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + expand_dims_var = self.get_var('tensor_expand_dims') + concat_var = self.get_var('tensor_concatenate') + tensor_array_expand_dims = self.prelude.map(expand_dims_var, tensor_array) + tensors = self.prelude.foldl(concat_var, + self.prelude.hd(tensor_array_expand_dims), + self.prelude.tl(tensor_array_expand_dims)) + self.prelude.mod[stack_var] = Function([tensor_array], tensors, tensor_type_var(), []) + + def register(self): + """Register all tensor array ops in Prelude""" + self.define_tensor_adt() + self.define_tensor_take() + self.define_tensor_expand_dims() + self.define_tensor_concat() + self.define_tensor_array() + self.define_tensor_array_read() + self.define_tensor_array_write() + self.define_tensor_array_unstack_tensor1() + self.define_tensor_array_unstack_tensor2() + self.define_tensor_array_scatter() + self.define_tensor_array_split() + self.define_tensor_array_concat() + self.define_tensor_array_stack() + # TODO(wweic): Gather fails in PartialEvaluate + # self.define_tensor_array_gather() + class Prelude: """Contains standard definitions.""" @@ -27,6 +532,17 @@ def __init__(self, mod=None): self.mod = mod self.load_prelude() + def get_name(self, canonical, dtype): + """Get name corresponding to the canonical name""" + if canonical == 'tensor_t': + return 'tensor_{}_t'.format(dtype) + return "{}_{}".format(canonical, dtype) + + def get_var(self, canonical, dtype): + """Get var corresponding to the canonical name""" + name = self.get_name(canonical, dtype) + return getattr(self, name) + def load_prelude(self): """Parses the Prelude from Relay's text format into 
a module.""" # TODO(@jroesch): we should remove this helper when we port over prelude @@ -74,3 +590,7 @@ def load_prelude(self): ] for global_def in GLOBAL_DEFS: setattr(self, global_def, self.mod.get_global_var(global_def)) + + for dtype in ['float32', 'int32']: + tensor_array_ops = TensorArrayOps(self, dtype) + tensor_array_ops.register() diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index d661be73ad02..d7b59922b89d 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -203,8 +203,12 @@ def convert_module(self): for var, func in self.mod.functions.items(): # optimize the definition so any operators used are lowered opt_func = self.optimize(func) - converted_func, _ = self.convert_func_node(opt_func, var) - defs.append(converted_func) + try: + converted_func, _ = self.convert_func_node(opt_func, var) + defs.append(converted_func) + except TypeError: + # TODO(wweic): fix conversion for Any + pass return defs diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 21f71af4eb8c..f85283094e91 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -309,7 +309,9 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { fields.push_back(instr.alloc_tensor_reg.shape_register); // Save `DLDataType` and the dst register. const auto& dtype = instr.alloc_tensor.dtype; - fields.assign({dtype.code, dtype.bits, dtype.lanes}); + fields.push_back(dtype.code); + fields.push_back(dtype.bits); + fields.push_back(dtype.lanes); fields.push_back(instr.dst); break; } diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index c2cbbff24173..3321d71a2cb8 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -60,13 +60,19 @@ def vmobj_to_list(o): result.append(vmobj_to_list(f)) return result elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue): - if o.constructor.name_hint == 'cons': + if o.constructor.name_hint == 'Cons': tl = vmobj_to_list(o.fields[1]) hd = vmobj_to_list(o.fields[0]) hd.extend(tl) return hd - elif o.constructor.name_hint == 'nil': + elif o.constructor.name_hint == 'Nil': return [] + elif 'tensor_nil' in o.constructor.name_hint: + return [0] + elif 'tensor' in o.constructor.name_hint: + return [o.fields[0].asnumpy()] + else: + raise RuntimeError("Unknown object type: %s" % o.constructor.name_hint) elif isinstance(o, tvm.relay.backend.interpreter.TensorValue): return [o.data.asnumpy()] else: @@ -77,14 +83,11 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, """ Generic function to compile on relay and execute on tvm """ input_data = convert_to_list(input_data) input_node = convert_to_list(input_node) - layout = None if target == "cuda": layout = "NCHW" target_host = None - shape_dict = {e: i.shape for e, i in zip(input_node, input_data)} - mod, params = relay.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict, @@ -581,6 +584,111 @@ def test_forward_squeeze(): _test_squeeze(np.arange(6).reshape((1, 2, 1, 3, 1)), [-3, -5]) _test_squeeze(np.arange(6).reshape((1, 2, 1, 3, 1)), [-3, -5, -1]) +def test_tensor_array_constructor(): + def run(dtype_str): + with tf.Graph().as_default(): + dtype = { + 'float32': tf.float32, + 'int32' : tf.int32 + }[dtype_str] + t = tf.constant(np.array([[1.0, 2.0], [3.0, 4.0]]).astype(dtype_str), dtype=dtype) + t2 = 
tf.constant(np.array([[1.0, 2.0], [3.0, 4.0]]).astype(dtype_str), dtype=dtype) + ta1 = tf.TensorArray(dtype=dtype, size=2, infer_shape=False, dynamic_size=False) + ta2 = ta1.write(0, t) + ta3 = ta2.write(1, t2) + out = ta3.read(0) + g = tf.get_default_graph() + compare_tf_with_tvm([], [], 'TensorArrayReadV3:0', mode='debug') + run('float32') + run('int32') + +def test_tensor_array_scatter(): + def run(dtype_str): + with tf.Graph().as_default(): + dtype = { + 'float32': tf.float32, + 'int32' : tf.int32 + }[dtype_str] + t = tf.constant(np.array([[1.0], [2.0], [3.0]]).astype(dtype_str), dtype=dtype) + indices = tf.constant([2, 1, 0]) + ta1 = tf.TensorArray(dtype=dtype, size=3, infer_shape=False, dynamic_size=False) + ta2 = ta1.scatter(indices, t) + out0 = ta2.read(0) + out1 = ta2.read(1) + out2 = ta2.read(2) + g = tf.get_default_graph() + compare_tf_with_tvm([], [], ['TensorArrayReadV3:0'], mode='debug') + compare_tf_with_tvm([], [], ['TensorArrayReadV3_1:0'], mode='debug') + compare_tf_with_tvm([], [], ['TensorArrayReadV3_2:0'], mode='debug') + run('float32') + run('int32') + +# TODO(wweic): Fix gather issue with PartialEvaluate +# def test_tensor_array_gather(): +# with tf.Graph().as_default(): +# dtype = 'float32' +# t = tf.constant([[1.0], [2.0], [3.0]]) +# scatter_indices = tf.constant([2, 1, 0]) +# gather_indices = tf.constant([1, 2]) +# ta1 = tf.TensorArray(dtype=tf.float32, size=3, infer_shape=False, dynamic_size=False) +# ta2 = ta1.scatter(scatter_indices, t) +# t1 = ta2.gather(gather_indices) +# g = tf.get_default_graph() +# compare_tf_with_tvm([], [], ['TensorArrayGatherV3:0'], mode='debug') + +def test_tensor_array_split(): + def run(dtype_str): + with tf.Graph().as_default(): + dtype = { + 'float32': tf.float32, + 'int32' : tf.int32 + }[dtype_str] + t = tf.constant(np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0]]).astype(dtype_str), dtype=dtype) + split_length = tf.constant([2, 2, 2, 2], dtype=tf.int32) + ta1 = tf.TensorArray(dtype=dtype, size=4, infer_shape=False, dynamic_size=False) + ta2 = ta1.split(t, split_length) + out0 = ta2.read(0) + out1 = ta2.read(1) + out2 = ta2.read(2) + out3 = ta2.read(3) + g = tf.get_default_graph() + compare_tf_with_tvm([], [], ['TensorArrayReadV3:0'], mode='debug') + compare_tf_with_tvm([], [], ['TensorArrayReadV3_1:0'], mode='debug') + compare_tf_with_tvm([], [], ['TensorArrayReadV3_2:0'], mode='debug') + compare_tf_with_tvm([], [], ['TensorArrayReadV3_3:0'], mode='debug') + run('float32') + run('int32') + +def test_tensor_array_concat(): + def run(dtype_str): + with tf.Graph().as_default(): + dtype = { + 'float32': tf.float32, + 'int32' : tf.int32 + }[dtype_str] + t = tf.constant(np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0]]).astype(dtype_str), dtype=dtype) + split_length = tf.constant([2, 2, 2, 2], dtype=tf.int32) + ta1 = tf.TensorArray(dtype=dtype, size=4, infer_shape=False, dynamic_size=False) + ta2 = ta1.split(t, split_length) + t = ta2.concat() + compare_tf_with_tvm([], [], ['TensorArrayConcatV3:0'], mode='debug') + run('float32') + run('int32') + +def test_tensor_array_size(): + def run(dtype_str): + with tf.Graph().as_default(): + dtype = { + 'float32': tf.float32, + 'int32' : tf.int32 + }[dtype_str] + ta1 = tf.TensorArray(dtype=dtype, size=2, infer_shape=False, dynamic_size=False) + out = ta1.size() + g = tf.get_default_graph() + compare_tf_with_tvm([], [], 'TensorArraySizeV3:0', mode='debug') + run('float32') + run('int32') + ####################################################################### # 
ConcatV2 # -------- diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 7be7c75dfe64..390d3cd9f3c4 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -21,6 +21,8 @@ from tvm.relay.prelude import Prelude from tvm.relay.testing import add_nat_definitions, count as count_, make_nat_value, make_nat_expr +import numpy as np + mod = relay.Module() p = Prelude(mod) add_nat_definitions(p) @@ -683,6 +685,146 @@ def test_iterate(): res = intrp.evaluate(relay.Function([], expr)()) assert count(res) == 12 +def test_tensor_expand_dims(): + def run(dtype): + x = relay.var('x') + mod = relay.Module() + p = Prelude(mod) + expand_dims_func = p.get_var('tensor_expand_dims', dtype) + tensor1 = p.get_var('tensor1', dtype) + mod["main"] = relay.Function([x], expand_dims_func(tensor1(x))) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + x_np = np.random.uniform(size=(1,)).astype(dtype) + result = ex.evaluate()(x_np) + got = vmobj_to_list(result) + expected = [np.expand_dims(x_np, axis=0)] + tvm.testing.assert_allclose(expected, got) + run('float32') + run('int32') + +def test_tensor_array_constructor(): + def run(dtype): + x = relay.var('x') + mod = relay.Module() + p = Prelude(mod) + tensor_array = p.get_var('tensor_array', dtype) + mod["main"] = relay.Function([x], tensor_array(x)) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + result = ex.evaluate()(5) + got = vmobj_to_list(result) + expected = np.array([0, 0, 0, 0, 0]) + tvm.testing.assert_allclose(expected, got) + run('float32') + run('int32') + +def test_tensor_array_read(): + def run(dtype): + mod = relay.Module() + p = Prelude(mod) + l = relay.var('l') + i = relay.var('i') + read_func = p.get_var('tensor_array_read', dtype) + tensor_array = p.get_var('tensor_array', dtype) + mod["main"] = relay.Function([l, i], read_func(tensor_array(l), i)) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + result = ex.evaluate()(10, 5) + got = vmobj_to_list(result) + expected = [0] + tvm.testing.assert_allclose(expected, got) + run('float32') + run('int32') + +def vmobj_to_list(o): + if isinstance(o, tvm.relay.backend.vmobj.Tensor): + return [o.asnumpy().tolist()] + elif isinstance(o, tvm.relay.backend.interpreter.TensorValue): + return [o.asnumpy()] + elif isinstance(o, tvm.relay.backend.vmobj.Datatype): + result = [] + for f in o: + result.extend(vmobj_to_list(f)) + return result + elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue): + if o.constructor.name_hint == 'Cons': + tl = vmobj_to_list(o.fields[1]) + hd = vmobj_to_list(o.fields[0]) + hd.extend(tl) + return hd + elif o.constructor.name_hint == 'Nil': + return [] + elif 'tensor_nil' in o.constructor.name_hint: + return [0] + elif 'tensor' in o.constructor.name_hint: + return [o.fields[0].asnumpy()] + else: + raise RuntimeError("Unknown object type: %s" % o.constructor.name_hint) + else: + raise RuntimeError("Unknown object type: %s" % type(o)) + +def test_tensor_array_stack(): + def run(dtype): + mod = relay.Module() + p = Prelude(mod) + tensor_array = p.get_var('tensor_array', dtype) + tensor1 = p.get_var('tensor1', dtype) + write = p.get_var('tensor_array_write', dtype) + stack = p.get_var('tensor_array_stack', dtype) + l = relay.var('l') + v = relay.var('v') + init_tensor_array = tensor_array(relay.const(3)) + tensor_array1 = write(init_tensor_array, relay.const(0), 
tensor1(v)) + tensor_array2 = write(tensor_array1, relay.const(1), tensor1(v)) + tensor_array3 = write(tensor_array2, relay.const(2), tensor1(v)) + tensor_array4 = stack(tensor_array3) + mod["main"] = relay.Function([v], tensor_array4) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + t = np.random.uniform(size=(1,)).astype(dtype) + result = ex.evaluate()(t) + res = vmobj_to_list(result) + expected = [np.stack([t, t, t])] + tvm.testing.assert_allclose(expected, res) + run('float32') + run('int32') + +def test_tensor_array_unstack(): + def run(dtype): + mod = relay.Module() + p = Prelude(mod) + unstack_tensor1 = p.get_var('tensor_array_unstack_tensor1', dtype) + v = relay.var('v') + mod["main"] = relay.Function([v], unstack_tensor1(v)) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + t = np.random.uniform(size=(1,)).astype(dtype) + result = ex.evaluate()(t) + res = vmobj_to_list(result) + tvm.testing.assert_allclose(t, res) + run('float32') + run('int32') + +def test_tensor_take(): + def run(dtype): + mod = relay.Module() + p = Prelude(mod) + take = p.get_var('tensor_take', dtype) + tensor2 = p.get_var('tensor2', dtype) + v = relay.var('v') + lower = relay.var('lower') + upper = relay.var('upper') + mod["main"] = relay.Function([v, lower, upper], take(tensor2(v), lower, upper)) + for kind in ["debug"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + t = np.random.uniform(size=(10, 10)).astype(dtype) + result = ex.evaluate()(t, 2, 5) + res = vmobj_to_list(result) + expected = [np.take(t, range(2, 5), axis=0)] + tvm.testing.assert_allclose(expected, res) + run('float32') + run('int32') if __name__ == "__main__": test_nat_constructor() @@ -707,3 +849,9 @@ def test_iterate(): test_size() test_compose() test_iterate() + + test_tensor_expand_dims() + test_tensor_array_constructor() + test_tensor_array_read() + test_tensor_array_stack() + test_tensor_array_unstack() diff --git a/tests/python/relay/test_feature.py b/tests/python/relay/test_feature.py index 8f0e90de0315..64eda9d04e7c 100644 --- a/tests/python/relay/test_feature.py +++ b/tests/python/relay/test_feature.py @@ -38,7 +38,8 @@ def test_prelude(): Feature.fLet, Feature.fIf, Feature.fConstructor, - Feature.fMatch + Feature.fMatch, + Feature.fGraph ]) From cb5277f979e59f7a29bf9d1987a381a1c5143a3e Mon Sep 17 00:00:00 2001 From: Gus Smith Date: Fri, 18 Oct 2019 08:19:32 -0700 Subject: [PATCH 03/59] Fix typo (#4144) --- src/pass/lower_tvm_builtin.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc index 69618985d50c..79329cbe717f 100644 --- a/src/pass/lower_tvm_builtin.cc +++ b/src/pass/lower_tvm_builtin.cc @@ -230,7 +230,7 @@ class BuiltinLower : public IRMutator { cast(Int(32), device_type_))); return TVMStructGet(Handle(), stack_array_, idx, intrinsic::kArrAddr); } - // call packled. + // call packed. 
Expr MakeCallPacked(const Call* op, const Expr& e) { size_t restore_shape_stack = run_shape_stack_; size_t restore_array_stack = run_array_stack_; From 86d445a9ccfb107ec33fd0d6d84b847473b3e038 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 18 Oct 2019 09:49:37 -0700 Subject: [PATCH 04/59] [CI] Pin NNPack pthreadtools version (#4152) --- docker/install/ubuntu_install_nnpack.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh index 4f45f130e2e5..dc51fc28d492 100755 --- a/docker/install/ubuntu_install_nnpack.sh +++ b/docker/install/ubuntu_install_nnpack.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,11 +22,14 @@ set -o pipefail apt-get update && apt-get install -y --no-install-recommends git cmake -# TODO: specific tag? git clone https://github.com/Maratyszcza/NNPACK NNPACK +git clone https://github.com/Maratyszcza/pthreadpool NNPACK/pthreadpool + +# Use specific versioning tag. (cd NNPACK && git checkout 1e005b0c2) +(cd NNPACK/pthreadpool && git checkout 13da0b4c) mkdir -p NNPACK/build cd NNPACK/build -cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && make -j4 && make install +cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DPTHREADPOOL_SOURCE_DIR=pthreadpool .. && make -j4 && make install cd - From c67bb94c210690cfc50019e19a807bfb72f71b82 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 18 Oct 2019 10:51:15 -0700 Subject: [PATCH 05/59] [QNN][TFLite] Parsing QNN Add op. Adding MobilenetV2. 
(#4142) --- python/tvm/relay/frontend/tflite.py | 66 +++++++++++++++++++- tests/python/frontend/tflite/test_forward.py | 22 ++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 35bc85e09fdd..b08dd6bf94e0 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -224,6 +224,18 @@ def has_same_qnn_params(self, lhs_tensor, rhs_tensor): return lhs_tensor.qnn_params['scale'] == rhs_tensor.qnn_params['scale'] and \ lhs_tensor.qnn_params['zero_point'] == rhs_tensor.qnn_params['zero_point'] + def is_quantized(self, op): + """Check if an input tensor is quantized.""" + try: + from tflite.Operator import Operator + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + first_tensor = input_tensors[0] + return first_tensor.qnn_params is not None + def convert_conv2d(self, op): """Convert TFLite conv2d""" return self.convert_conv(op, "conv2d") @@ -498,7 +510,25 @@ def _convert_elemwise(self, relay_op, op): rhs_type_str = self.get_tensor_type_str(rhs_tensor.tensor.Type()) rhs_expr = self.exp_tab.new_const(self.get_tensor_value(rhs_tensor), dtype=rhs_type_str) - out = relay_op(lhs_expr, rhs_expr) + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors length should be 1" + output_tensor = output_tensors[0] + + # If quantized, extracts qnn params and call QNN add operator. + if lhs_tensor.qnn_params: + assert rhs_tensor.qnn_params, "Both tensors should be quantized." + assert output_tensor.qnn_params, "Output tensor should be quantized." + out = relay_op(lhs=lhs_expr, + rhs=rhs_expr, + lhs_scale=lhs_tensor.qnn_params['scale'], + lhs_zero_point=lhs_tensor.qnn_params['zero_point'], + rhs_scale=rhs_tensor.qnn_params['scale'], + rhs_zero_point=rhs_tensor.qnn_params['zero_point'], + output_scale=output_tensor.qnn_params['scale'], + output_zero_point=output_tensor.qnn_params['zero_point']) + else: + out = relay_op(lhs_expr, rhs_expr) # Options (fused_activation_function) options = None @@ -517,36 +547,70 @@ def _convert_elemwise(self, relay_op, op): fused_activation_fn = options.FusedActivationFunction() # if we have activation fn if fused_activation_fn != ActivationFunctionType.NONE: + if output_tensor.qnn_params: + raise tvm.error.OpNotImplemented( + 'Elemwise operators with fused activation are not supported yet.') out = self.convert_fused_activation_function(out, fused_activation_fn) return out def convert_add(self, op): """Convert TFLite ADD""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + return self._convert_elemwise(_qnn.op.add, op) return self._convert_elemwise(_op.add, op) def convert_sub(self, op): """Convert TFLite SUB""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized sub operator is not supported yet.') return self._convert_elemwise(_op.subtract, op) def convert_mul(self, op): """Convert TFLite MUL""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized mul operator is not supported yet.') return self._convert_elemwise(_op.multiply, op) def convert_div(self, op): """Convert TFLite DIV""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise 
tvm.error.OpNotImplemented( + 'TFlite quantized div operator is not supported yet.') return self._convert_elemwise(_op.divide, op) def convert_pow(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized pow operator is not supported yet.') return self._convert_elemwise(_op.power, op) def convert_maximum(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized maximum operator is not supported yet.') return self._convert_elemwise(_op.maximum, op) def convert_minimum(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized minimum operator is not supported yet.') return self._convert_elemwise(_op.minimum, op) def convert_greater(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized greater operator is not supported yet.') return self._convert_elemwise(_op.greater, op) def convert_zeros_like(self, op): diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index a71a24ee0a4f..29b0c87c5b32 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1037,6 +1037,26 @@ def test_forward_qnn_mobilenet_v1_net(): tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1] tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels) +def test_forward_qnn_mobilenet_v2_net(): + """Test the Quantized TFLite Mobilenet V2 model.""" + # MobilenetV2 + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz", + "mobilenet_v2_1.0_224_quant.tflite") + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + # Checking the labels because the requantize implementation is different between TFLite and + # Relay. This cause final output numbers to mismatch. So, testing accuracy via labels. + np.random.seed(0) + data = np.random.random_integers(low=0, high=128, size=(1, 224, 224, 3)).astype('uint8') + tflite_output = run_tflite_graph(tflite_model_buf, data) + tflite_predictions = np.squeeze(tflite_output) + tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1] + tvm_output = run_tvm_graph(tflite_model_buf, data, 'input') + tvm_predictions = np.squeeze(tvm_output) + tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1] + tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels) + ####################################################################### # SSD Mobilenet # ------------- @@ -1111,6 +1131,6 @@ def test_forward_ssd_mobilenet_v1(): test_forward_ssd_mobilenet_v1() # End to End quantized - # TODO - MobilenetV2 fails for now. Remove when fixed. 
test_forward_qnn_inception_v1_net() test_forward_qnn_mobilenet_v1_net() + test_forward_qnn_mobilenet_v2_net() From 7aae836007d8c1e0f8ef3f3a291f97d8231579c1 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 18 Oct 2019 15:22:37 -0700 Subject: [PATCH 06/59] Add lift_if_then_else pass (#3865) * Add LiftIfThenElse pass * Add more comments * Rename and refactor * Add description for internal data structure * Rename a test * Minor change * Address comments * Improve update_for --- include/tvm/ir_pass.h | 7 + src/api/api_pass.cc | 1 + src/pass/hoist_if_then_else.cc | 424 ++++++++++++++++++++ tests/python/unittest/test_pass_hoist_if.py | 185 +++++++++ 4 files changed, 617 insertions(+) create mode 100644 src/pass/hoist_if_then_else.cc create mode 100644 tests/python/unittest/test_pass_hoist_if.py diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 5ac71fdce47b..03078b8be41f 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -377,6 +377,13 @@ Stmt LowerStorageAccessInfo(Stmt stmt); */ Stmt DecorateDeviceScope(Stmt stmt); +/*! + * \brief Loop invariant code motion which locates and hoists if statements. + * \param stmt The stmt to do if statement hoisting. + * \return Transformed stmt. + */ +Stmt HoistIfThenElse(Stmt stmt); + /*! * \brief Make an user callable API LoweredFunc. * diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 25cd5838385f..d2352496c2b4 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -160,5 +160,6 @@ REGISTER_PASS(VerifyGPUCode); REGISTER_PASS(DecorateDeviceScope); REGISTER_PASS(InstrumentBoundCheckers); REGISTER_PASS(VerifyCompactBuffer); +REGISTER_PASS(HoistIfThenElse); } // namespace ir } // namespace tvm diff --git a/src/pass/hoist_if_then_else.cc b/src/pass/hoist_if_then_else.cc new file mode 100644 index 000000000000..bbdb609e9a08 --- /dev/null +++ b/src/pass/hoist_if_then_else.cc @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file hoist_if_then_else.cc + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../arithmetic/int_set.h" +#include "../runtime/thread_storage_scope.h" + +namespace tvm { +namespace ir { + +using HoistMap = std::unordered_map>; +using VarMap = std::unordered_map>; + +/* + * This pass tries to hoist IfThenElse stmt out of For loop if condition is loop invariant. + * For example, given the following block: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 4; j++) + * for (k = 0; k < 5; k++) + * if (likely(i*2 < 4)) + * A[3*i+2j+k] = B[7*i+3j+k] + * + * We first detect all IfThenElse stmt and find the corresponding loop invariant For stmt. 
+ * Then we hoist IfThenElse stmt by one For stmt each step: + * + * Step 1: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 4; j++) + * if (likely(i*2 < 4)) + * for (k = 0; k < 5; k++) + * A[3*i+2j+k] = B[7*i+3j+k] + * + * Step 2: + * for (i = 0; i < 3; i++) + * if (likely(i*2 < 4)) + * for (j = 0; j < 4; j++) + * for (k = 0; k < 5; k++) + * A[3*i+2j+k] = B[7*i+3j+k] + * + * In this pass, we only continue detecting possible hoisting chance when visiting For, + * IfThenElse or AttrStmt Node. For example, for the following block: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 4; j++) + * A[i + j] = A[i + j] - 1 + * for (k = 0; k < 5; k++) + * if (likely(i*2 < 4)) + * A[3*i+2j+k] = B[7*i+3j+k] + * + * Only the For with k variable will be considered and the resulting stmt would be: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 4; j++) + * A[i + j] = A[i + j] - 1 + * if (likely(i*2 < 4)) + * for (k = 0; k < 5; k++) + * A[3*i+2j+k] = B[7*i+3j+k] + * + * This pass doesn't do hoisting for consecutive IfThenElse stmt. The following + * block won't be optimized: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 4; j++) + * for (k = 0; k < 5; k++) + * if (likely(i*2 < 4)) + * A[3*i+2j+k] = B[7*i+3j+k] + * if (likely(j > 2)) + * A[i+j+k] = B[i+j+k] + * + */ +class IfThenElseHoist { + public: + Stmt VisitAndMutate(const Stmt& stmt) { + SelectCandidates(stmt); + LocateTopFor(); + return PostOrderMutate(stmt); + } + + private: + void SelectCandidates(const Stmt& stmt); + void LocateTopFor(); + Stmt PostOrderMutate(const Stmt& stmt); + size_t GetUpdatedFor(const Stmt& for_stmt, const Stmt& if_stmt); + Stmt HoistIf(const Stmt& if_stmt); + + // Map of all For nodes to all child IfThenElse nodes. + HoistMap for2if_map_; + // Map of all IfThenElse nodes to all For nodes which are loop invariant. + HoistMap if2for_map_; + // Map of highest loop invariant For to child IfThenElse. + HoistMap top_for_var_map_; + // Map of original For to list of update For nodes. + HoistMap for_tracking_map_; + // Map of all IfThenElse nodes to condition variable nodes. + VarMap cond_var_map_; + // List of For nodes added in post order DFS visiting. + std::vector ordered_for_list_; +}; + +// Check whether a given IfThenElse stmt is the first one appearing +// in a For stmt. +bool is_first_if(const Stmt& for_stmt, const Stmt& if_stmt) { + std::vector if_node_list; + const For* for_node = for_stmt.as(); + CHECK(for_node); + CHECK(if_stmt.as()); + + PostOrderVisit(for_node->body, [&](const NodeRef& node) { + if (node.as()) { + if_node_list.push_back(node.get()); + } + }); + return if_node_list.empty() ? false : if_stmt.get() == if_node_list.back(); +} + +// Update upper level For node when current For node is modified. +// With this function we only need to visit and mutate top level For node +// in the main VisitAndMutate function. +Stmt update_for(const Stmt& parent_for_stmt, const Stmt& new_if_stmt) { + const Node* top_for_node; + const For* parent_for_node = parent_for_stmt.as(); + CHECK(parent_for_node); + CHECK(new_if_stmt.as()); + + PostOrderVisit(parent_for_node->body, [&](const NodeRef& node) { + if (node.as()) { + top_for_node = node.get(); + } + }); + + PackedFunc replace_target_for = PackedFunc( + [&](TVMArgs args, TVMRetValue *ret){ + const NodeRef& current_for = args[0]; + if (current_for.get() == top_for_node) { + *ret = new_if_stmt; + } + }); + + return IRTransform(parent_for_stmt, nullptr, replace_target_for, + {Expr("For")}); +} + +// Remove IfThenElse node from a For node. 
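+// E.g. (illustrative sketch):
+//   for (k) { if (c) A else B }
+// splits into
+//   for (k) { A }   and   for (k) { B }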
+// A pair of For nodes will be generated. +std::pair RemoveIf(const Stmt& for_stmt, const Stmt& if_stmt) { + Stmt then_for; + Stmt else_for; + CHECK(if_stmt.as()); + + PackedFunc replace_then_case = PackedFunc( + [&](TVMArgs args, TVMRetValue *ret){ + const NodeRef& node = args[0]; + if (node == if_stmt) { + *ret = node.as()->then_case; + } + }); + + PackedFunc replace_else_case = PackedFunc( + [&](TVMArgs args, TVMRetValue *ret){ + const NodeRef& node = args[0]; + if (node == if_stmt) { + *ret = node.as()->else_case; + } + }); + + then_for = IRTransform(for_stmt, nullptr, replace_then_case, + {Expr("IfThenElse")}); + if (if_stmt.as()->else_case) { + else_for = IRTransform(for_stmt, nullptr, replace_else_case, + {Expr("IfThenElse")}); + } + + return std::make_pair(then_for, else_for); +} + +// Locate all For nodes and capture child IfThenElse nodes. +void IfThenElseHoist::SelectCandidates(const Stmt& stmt) { + PostOrderVisit(stmt, [&](const NodeRef& node){ + const For* for_node = node.as(); + if (!for_node) return; + + std::queue tracker; + tracker.push(for_node->body); + Stmt for_stmt = Downcast(node); + for2if_map_.insert({for_stmt.get(), std::vector()}); + while (!tracker.empty()) { + Stmt head = tracker.front(); + tracker.pop(); + if (head->is_type()) { + for (const auto& if_stmt : for2if_map_.at(head.get())) { + for2if_map_[for_stmt.get()].push_back(if_stmt); + } + } else if (head->is_type()) { + const AttrStmt* attr_node = head.as(); + tracker.push(attr_node->body); + } else if (head->is_type()) { + for2if_map_[for_stmt.get()].push_back(head); + const IfThenElse* if_node = head.as(); + tracker.push(if_node->then_case); + if (if_node->else_case) { + tracker.push(if_node->else_case); + } + + // Record condition variables. + if (!cond_var_map_.count(head.get())) { + std::unordered_set new_var_set; + cond_var_map_.insert({head.get(), new_var_set}); + PostOrderVisit(if_node->condition, [&](const NodeRef& cond_node) { + if (cond_node.as()) { + cond_var_map_[head.get()].insert(cond_node.get()); + } + }); + } + } else { + continue; + } + } + ordered_for_list_.emplace_back(Downcast(node)); + }); +} + +// For each IfThenElse node, find the highest For node which +// meets loop invariant condition. +void IfThenElseHoist::LocateTopFor() { + std::unordered_map if_position_map; + std::unordered_set top_for_var_set; + + // Create IfThenElse -> For map. + for (const Stmt& for_stmt : ordered_for_list_) { + std::vector if_list = for2if_map_[for_stmt.get()]; + const For* for_node = for_stmt.as(); + CHECK(for_node); + top_for_var_map_.insert({for_node->loop_var.get(), if_list}); + for (const Stmt& if_stmt : if_list) { + const Node* if_node = if_stmt.get(); + if2for_map_[if_node].push_back(for_stmt); + } + } + + // Locate the highest For node which is loop invariant. 
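+  // E.g. (sketch, following the example in the file header): for
+  // `if (likely(i*2 < 4))` nested under loops i > j > k, the walk below
+  // stops at the `i` loop since the condition reads `i`; the `j` loop is
+  // then the highest For the if can be hoisted out of, leaving the if
+  // directly under the `i` loop.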
+ for (const auto& item : if2for_map_) { + Stmt top_for; + const Node* if_stmt = item.first; + std::vector for_list = item.second; + for (size_t i = 0; i < for_list.size(); ++i) { + const Stmt& for_stmt = for_list.at(i); + const For* for_node = for_stmt.as(); + CHECK(for_node); + std::vector new_for_list{for_stmt}; + for_tracking_map_.insert({for_stmt.get(), new_for_list}); + if (cond_var_map_[if_stmt] + .count(for_node->loop_var.get())) { + std::vector updated_for_list(for_list.begin(), + for_list.begin() + i); + if2for_map_[if_stmt] = updated_for_list; + break; + } else { + top_for = for_stmt; + } + } + if (top_for.as()) { + if_position_map.insert({if_stmt, top_for}); + } + } + + for (const auto& item : if_position_map) { + top_for_var_set.insert(item.second.as()->loop_var.get()); + } + + std::vector removed_for_var_list; + for (const auto& item : top_for_var_map_) { + const Node* top_for_var = item.first; + std::vector if_list = item.second; + if (!top_for_var_set.count(top_for_var)) { + removed_for_var_list.push_back(top_for_var); + } else { + std::vector actual_if_list; + for (const Stmt& if_stmt : if_list) { + if (if_position_map.count(if_stmt.get())) { + actual_if_list.push_back(if_stmt); + } + } + top_for_var_map_[top_for_var] = actual_if_list; + } + } + for (const Node* top_for_var : removed_for_var_list) { + top_for_var_map_.erase(top_for_var); + } +} + +// When we try to mutate a For node, some child For nodes can have already +// been mutated. This function is to get the updated For node and further +// hoisting can be done based on this new node. +// We keep all For nodes tracing in for_tracking_map_. When we get a +// hoisted IfThenElse, we match it with tracing For nodes to pick +// the updated one. +size_t IfThenElseHoist::GetUpdatedFor(const Stmt& for_stmt, + const Stmt& if_stmt) { + std::vector tracked_for_list = for_tracking_map_[for_stmt.get()]; + size_t updated_for_idx = 0; + for (size_t i = 0; i < tracked_for_list.size(); ++i) { + const Stmt& current_for = + tracked_for_list.at(tracked_for_list.size() - 1 - i); + if (is_first_if(current_for, if_stmt)) { + updated_for_idx = tracked_for_list.size() - 1 - i; + break; + } + } + return updated_for_idx; +} + +// Hoist an IfThenElse node as high as possible. +// This function iterates on all candidate For nodes. For each For node, +// it first removes IfThenElse nodes. Then it generates a new IfThenElse +// node using mutated For nodes. 
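+// One hoisting step, schematically (illustrative, matching Step 1 of the
+// header comment):
+//   before: for (j) { for (k) { if (c) A } }
+//   after:  for (j) { if (c) { for (k) A } }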
+Stmt IfThenElseHoist::HoistIf(const Stmt& if_stmt) { + Stmt new_if = if_stmt; + + for (size_t i = 0; i < if2for_map_[if_stmt.get()].size(); ++i) { + const Stmt& for_stmt = if2for_map_[if_stmt.get()].at(i); + size_t updated_for_idx = GetUpdatedFor(for_stmt, new_if); + const Stmt& updated_for_node = + for_tracking_map_[for_stmt.get()].at(updated_for_idx); + auto generated_for_pair = RemoveIf(updated_for_node, new_if); + const Stmt& then_for = generated_for_pair.first; + const Stmt& else_for = generated_for_pair.second;; + for_tracking_map_[for_stmt.get()].at(updated_for_idx) = then_for; + + if (else_for.get()) { + for_tracking_map_[for_stmt.get()].push_back(else_for); + } + + const IfThenElse* new_if_node = new_if.as(); + CHECK(new_if_node); + new_if = IfThenElse::make(new_if_node->condition, then_for, else_for); + if (i < if2for_map_[if_stmt.get()].size() - 1) { + const Stmt& original_next_for = if2for_map_[if_stmt.get()].at(i + 1); + const Stmt& actual_next_for = + for_tracking_map_[original_next_for.get()].at(updated_for_idx); + Stmt update_for_stmt = update_for(actual_next_for, new_if); + + for_tracking_map_[original_next_for.get()]. + at(updated_for_idx) = update_for_stmt; + } + } + return new_if; +} + +// Mutate For nodes in post order DFS manner. +Stmt IfThenElseHoist::PostOrderMutate(const Stmt& stmt) { + PackedFunc replace_top_for = PackedFunc( + [&](TVMArgs args, TVMRetValue *ret){ + const NodeRef& current_for = args[0]; + const For* for_node = current_for.as(); + if (!for_node) return; + + if (top_for_var_map_.count(for_node->loop_var.get())) { + std::vector new_if_list; + for (const Stmt& if_stmt : + top_for_var_map_[for_node->loop_var.get()]) { + new_if_list.emplace_back(HoistIf(if_stmt)); + } + + const IfThenElse* next_if_node; + const IfThenElse* current_if_node = + new_if_list.back().as(); + Stmt new_for = Stmt(); + for (size_t i = new_if_list.size() - 1; i > 0; --i) { + CHECK(current_if_node); + const Stmt current_if_stmt = + IfThenElse::make(current_if_node->condition, + current_if_node->then_case, + current_if_node->else_case); + next_if_node = new_if_list[i - 1].as(); + CHECK(next_if_node); + new_for = IfThenElse::make(next_if_node->condition, current_if_stmt, + next_if_node->else_case); + current_if_node = new_for.as(); + } + + if (!new_for.get()) { + const IfThenElse* first_if_node = new_if_list[0].as(); + CHECK(first_if_node); + new_for = IfThenElse::make(first_if_node->condition, + first_if_node->then_case, + first_if_node->else_case); + } + *ret = new_for; + } + }); + return IRTransform(stmt, nullptr, replace_top_for, {Expr("For")}); +} + +Stmt HoistIfThenElse(Stmt stmt) { + return IfThenElseHoist().VisitAndMutate(stmt); +} + +} // namespace ir +} // namespace tvm diff --git a/tests/python/unittest/test_pass_hoist_if.py b/tests/python/unittest/test_pass_hoist_if.py new file mode 100644 index 000000000000..4a28cf6b318a --- /dev/null +++ b/tests/python/unittest/test_pass_hoist_if.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm + + +var_list = [] + +def verify_structure(stmt, expected_struct): + node_dict = {} + struct = {} + def _extract_vars(op): + global var_list + if isinstance(op, tvm.expr.Var): + var_list.append(op.name) + + def _visit(op): + key = op + if isinstance(op, tvm.stmt.IfThenElse): + global var_list + tvm.ir_pass.PostOrderVisit(op.condition, _extract_vars) + val = [(op.then_case, op.else_case), ("IfThenElse", tuple(var_list))] + var_list.clear() + elif isinstance(op, tvm.stmt.For): + val = [(op.body,), ("For", op.loop_var.name)] + elif isinstance(op, tvm.stmt.AttrStmt): + val = [(op.body,), ("AttrStmt", op.attr_key, int(op.value))] + else: + return + node_dict[key] = val + + tvm.ir_pass.PostOrderVisit(stmt, _visit) + for key, val in node_dict.items(): + struct[val[1]] = tuple(node_dict[child][1] if child in node_dict + else None for child in val[0]) + + assert struct == expected_struct, "Structure mismatch: expect %s but got %s" \ + % (expected_struct, struct) + var_list.clear() + +def test_basic(): + ib = tvm.ir_builder.create() + l = tvm.var('l') + m = tvm.var('m') + n = tvm.var('n') + + with ib.for_range(0, l, "i") as i: + with ib.for_range(0, m, "j") as j: + with ib.for_range(0, n, "k") as k: + with ib.if_scope(ib.likely(i < 2)): + ib.emit(tvm.make.Evaluate(m)) + with ib.else_scope(): + ib.emit(tvm.make.Evaluate(n)) + + stmt = ib.get() + new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + expected_struct = {('For', 'k'): (None,), ('For', 'j'): (('For', 'k'),), + ('IfThenElse', ('i',)): (('For', 'j'), ('For', 'j')), + ('For', 'i'): (('IfThenElse', ('i',)),)} + verify_structure(new_stmt, expected_struct) + +def test_no_else(): + ib = tvm.ir_builder.create() + l = tvm.var('l') + m = tvm.var('m') + n = tvm.var('n') + + with ib.for_range(0, l, "i") as i: + with ib.for_range(0, m, "j") as j: + with ib.for_range(0, n, "k") as k: + with ib.if_scope(ib.likely(i < 2)): + ib.emit(tvm.make.Evaluate(m)) + + stmt = ib.get() + new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + expected_struct = {('For', 'k'): (None,), ('For', 'j'): (('For', 'k'),), + ('IfThenElse', ('i',)): (('For', 'j'), None), + ('For', 'i'): (('IfThenElse', ('i',)),)} + verify_structure(new_stmt, expected_struct) + +def test_attr_stmt(): + ib = tvm.ir_builder.create() + dshape = (32, 64) + data = ib.pointer("float32", name="data") + l = tvm.var('l') + m = tvm.var('m') + n = tvm.var('n') + + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", dshape[0]) + ib.scope_attr(bx, "thread_extent", dshape[1]) + with ib.for_range(0, l, "i") as i: + with ib.for_range(0, m, "j") as j: + with ib.for_range(0, n, "k") as k: + with ib.if_scope(tvm.any(i < 4, j >= 8)): + data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 0.5 + with ib.else_scope(): + data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 1.0 + + stmt = ib.get() + new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + expected_struct = {('For', 'k'): (None,), ('IfThenElse', ('i', 'j')): (('For', 'k'), ('For', 'k')), + ('For', 'j'): (('IfThenElse', ('i', 'j')),), ('For', 'i'): (('For', 'j'),), + ('AttrStmt', 'thread_extent', 
64): (('For', 'i'),), + ('AttrStmt', 'thread_extent', 32): (('AttrStmt', 'thread_extent', 64),)} + verify_structure(new_stmt, expected_struct) + +def test_nested_for(): + ib = tvm.ir_builder.create() + data = ib.pointer("float32", name="data") + + + with ib.for_range(0, 5, "i") as i: + with ib.for_range(0, 10, "j") as j: + with ib.if_scope(i >= 3): + data[i * 3 + j] = data[i * 3 + j] + 0.5 + with ib.for_range(0, 15, "k") as k: + with ib.for_range(0, 20, "l") as l: + with ib.if_scope(tvm.any(i < 4, j >= 8)): + data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 2 + with ib.else_scope(): + data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 1.5 + + stmt = ib.get() + new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + expected_struct = {('IfThenElse', ('i', 'j')): (None, None), ('For', 'l'): (('IfThenElse', ('i', 'j')),), + ('For', 'k'): (('For', 'l'),), ('For', 'j'): (None,), ('IfThenElse', ('i',)): (('For', 'j'), None), + ('For', 'i'): (('IfThenElse', ('i',)),)} + verify_structure(new_stmt, expected_struct) + +def test_if_block(): + ib = tvm.ir_builder.create() + data = ib.pointer("float32", name="data") + n = tvm.var("n") + + + with ib.for_range(0, 5, "i") as i: + with ib.for_range(0, 10, "j") as j: + with ib.if_scope(i >= 3): + data[i * 3 + j] = data[i * 3 + j] + 0.5 + with ib.for_range(0, 15, "k") as k: + with ib.for_range(0, 20, "l") as l: + with ib.if_scope(tvm.any(i < 4, j >= 8)): + data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 2 + with ib.else_scope(): + data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 1.5 + with ib.if_scope(j < 5): + data[i * 3 + j + k + l] = data[i * 3 + j + k + l] - 1 + + + with ib.for_range(0, 5, "i") as i: + with ib.for_range(0, 10, "j") as j: + with ib.for_range(0, 15, "k") as k: + with ib.if_scope(n >= 3): + data[i * 3 + j + k] = data[i * 3 + j + k] + 0.6 + + stmt = ib.get() + new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + expected_struct = {('IfThenElse', ('i', 'j')): (None, None), ('IfThenElse', ('j',)): (None, None), + ('For', 'l'): (None,), ('For', 'k'): (None,), ('For', 'j'): (('For', 'j'),), + ('IfThenElse', ('i',)): (('For', 'j'), None), ('For', 'i'): (('IfThenElse', ('i',)),), + ('IfThenElse', ('n',)): (('For', 'j'), None)} + verify_structure(new_stmt, expected_struct) + + +if __name__ == "__main__": + test_basic() + test_no_else() + test_attr_stmt() + test_nested_for() + test_if_block() From 909900fc3ae3b366aa9ad3e4d33031240ac01883 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 18 Oct 2019 16:15:52 -0700 Subject: [PATCH 07/59] [CI] Update cpu docker (#4153) --- Jenkinsfile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4b9ae9cafd88..c140d9c58ad2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -38,9 +38,15 @@ // - Tag the new version as the latest // - Periodically cleanup the old versions on local workers // + +// Hashtag in the source to build current CI docker builds +// +// - ci-cpu:v0.54: e7c88a99f830de30814df14eaa980547ecbd61c1 +// + ci_lint = "tvmai/ci-lint:v0.51" ci_gpu = "tvmai/ci-gpu:v0.54" -ci_cpu = "tvmai/ci-cpu:v0.52" +ci_cpu = "tvmai/ci-cpu:v0.54" ci_i386 = "tvmai/ci-i386:v0.52" // tvm libraries From 6f5d9f206a2d6865efc48d1d61911e20b80a6c94 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Sat, 19 Oct 2019 21:57:50 -0700 Subject: [PATCH 08/59] [Refactor] Rename Datatype to ADT (#4156) We think this will reduce confusion about the meaning.
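As a quick orientation to the renamed API, a minimal C++ sketch of building these runtime objects after the patch is shown here; the header paths and the NDArray setup are illustrative assumptions, while the Tensor/ADT names and the ADTObj layout follow the diff below:

    #include <tvm/runtime/ndarray.h>
    #include <tvm/runtime/vm.h>

    using namespace tvm::runtime;

    void AdtSketch() {
      // Arbitrary tensor payload used as an ADT field (shape/dtype/ctx illustrative).
      NDArray nd = NDArray::Empty({2}, DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0});
      vm::Tensor t(nd);
      // Before this patch these were vm::Datatype(tag, fields) and vm::Datatype::Tuple.
      vm::ADT adt(/*tag=*/0, {t, t});
      vm::ADT tup = vm::ADT::Tuple({t, t});
      size_t tag = adt->tag;  // tag and fields live on vm::ADTObj
      (void)tag; (void)tup;
    }

The motivating discussion is linked below.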
https://discuss.tvm.ai/t/discuss-consider-rename-vm-datatype/4339 --- docs/dev/virtual_machine.rst | 10 +++---- include/tvm/runtime/object.h | 2 +- include/tvm/runtime/vm.h | 24 ++++++++-------- python/tvm/relay/backend/vm.py | 2 +- python/tvm/relay/backend/vmobj.py | 20 ++++++------- src/relay/backend/vm/compiler.cc | 8 +++--- src/runtime/vm/executable.cc | 6 ++-- src/runtime/vm/object.cc | 28 +++++++++---------- src/runtime/vm/vm.cc | 24 ++++++++-------- .../frontend/tensorflow/test_forward.py | 2 +- tests/python/relay/test_adt.py | 2 +- tests/python/relay/test_vm.py | 2 +- tests/python/relay/test_vm_object.py | 8 +++--- 13 files changed, 69 insertions(+), 69 deletions(-) diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 2791ee71177e..cb08cc14e56e 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -121,7 +121,7 @@ AllocTensor Allocate a tensor value of the appropriate shape (stored in `shape_register`) and `dtype`. The result is saved to register `dst`. -AllocDatatype +AllocADT ^^^^^^^^^^^^^ **Arguments**: :: @@ -176,7 +176,7 @@ GetTag RegName object RegName dst -Get the object tag for Datatype object in register `object`. And saves the reult to register `dst`. +Get the object tag for the ADT object in register `object` and save the result to register `dst`. Fatal ^^^^^ @@ -251,9 +251,9 @@ Currently, we support 3 types of objects: tensors, data types, and closures. :: - VMObject VMTensor(const tvm::runtime::NDArray& data); - VMObject VMDatatype(size_t tag, const std::vector<VMObject>& fields); - VMObject VMClosure(size_t func_index, std::vector<VMObject> free_vars); + Object Tensor(const tvm::runtime::NDArray& data); + Object ADT(size_t tag, const std::vector<Object>& fields); + Object Closure(size_t func_index, std::vector<Object> free_vars); Stack and State diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 0693b1f47b3c..7291510c16df 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -51,7 +51,7 @@ enum TypeIndex { kRoot = 0, kVMTensor = 1, kVMClosure = 2, - kVMDatatype = 3, + kVMADT = 3, kStaticIndexEnd, /*! \brief Type index is allocated during runtime. */ kDynamic = kStaticIndexEnd diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index a276c658c496..7d2df0b285b1 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -57,31 +57,31 @@ class Tensor : public ObjectRef { /*! \brief An object representing a structure or enumeration. */ -class DatatypeObj : public Object { +class ADTObj : public Object { public: /*! \brief The tag representing the constructor used. */ size_t tag; /*! \brief The fields of the structure. */ std::vector<ObjectRef> fields; - static constexpr const uint32_t _type_index = TypeIndex::kVMDatatype; - static constexpr const char* _type_key = "vm.Datatype"; - TVM_DECLARE_FINAL_OBJECT_INFO(DatatypeObj, Object); + static constexpr const uint32_t _type_index = TypeIndex::kVMADT; + static constexpr const char* _type_key = "vm.ADT"; + TVM_DECLARE_FINAL_OBJECT_INFO(ADTObj, Object); }; -/*! \brief reference to data type. */ -class Datatype : public ObjectRef { +/*! \brief reference to algebraic data type objects. */ +class ADT : public ObjectRef { public: - Datatype(size_t tag, std::vector<ObjectRef> fields); + ADT(size_t tag, std::vector<ObjectRef> fields); /*! * \brief construct a tuple object. * \param fields The fields of the tuple. * \return The constructed tuple type.
*/ - static Datatype Tuple(std::vector<ObjectRef> fields); + static ADT Tuple(std::vector<ObjectRef> fields); - TVM_DEFINE_OBJECT_REF_METHODS(Datatype, ObjectRef, DatatypeObj); + TVM_DEFINE_OBJECT_REF_METHODS(ADT, ObjectRef, ADTObj); }; /*! \brief An object representing a closure. */ @@ -129,7 +129,7 @@ enum class Opcode { InvokePacked = 4U, AllocTensor = 5U, AllocTensorReg = 6U, - AllocDatatype = 7U, + AllocADT = 7U, AllocClosure = 8U, GetField = 9U, If = 10U, @@ -237,7 +237,7 @@ struct Instruction { /*! \brief The register to project from. */ RegName object; } get_tag; - struct /* AllocDatatype Operands */ { + struct /* AllocADT Operands */ { /*! \brief The datatype's constructor tag. */ Index constructor_tag; /*! \brief The number of fields to store in the datatype. */ @@ -294,7 +294,7 @@ struct Instruction { * \param dst The register name of the destination. * \return The allocate instruction tensor. */ - static Instruction AllocDatatype(Index tag, Index num_fields, const std::vector<RegName>& fields, + static Instruction AllocADT(Index tag, Index num_fields, const std::vector<RegName>& fields, RegName dst); /*! \brief Construct an allocate closure instruction. * \param func_index The index of the function table. diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 942c93b866f4..e190e3f1eb41 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -31,7 +31,7 @@ from .interpreter import Executor Tensor = _obj.Tensor -Datatype = _obj.Datatype +ADT = _obj.ADT def _convert(arg, cargs): if isinstance(arg, (np.ndarray, tvm.nd.NDArray)): diff --git a/python/tvm/relay/backend/vmobj.py b/python/tvm/relay/backend/vmobj.py index 939b122bf510..f3fdb763209d 100644 --- a/python/tvm/relay/backend/vmobj.py +++ b/python/tvm/relay/backend/vmobj.py @@ -61,14 +61,14 @@ def asnumpy(self): return self.data.asnumpy() -@register_object("vm.Datatype") -class Datatype(Object): - """Datatype object. +@register_object("vm.ADT") +class ADT(Object): + """Algebraic data type (ADT) object. Parameters ---------- tag : int - The tag of datatype. + The tag of the ADT. fields : list[Object] or tuple[Object] The source tuple. @@ -77,22 +77,22 @@ def __init__(self, tag, fields): for f in fields: assert isinstance(f, Object) self.__init_handle_by_constructor__( - _vmobj.Datatype, tag, *fields) + _vmobj.ADT, tag, *fields) @property def tag(self): - return _vmobj.GetDatatypeTag(self) + return _vmobj.GetADTTag(self) def __getitem__(self, idx): return getitem_helper( - self, _vmobj.GetDatatypeFields, len(self), idx) + self, _vmobj.GetADTFields, len(self), idx) def __len__(self): - return _vmobj.GetDatatypeNumberOfFields(self) + return _vmobj.GetADTNumberOfFields(self) def tuple_object(fields): - """Create a datatype object from source tuple. + """Create an ADT object from a source tuple. Parameters ---------- fields : list[Object] or tuple[Object] The source tuple. Returns ------- - ret : Datatype + ret : ADT The created object.
""" for f in fields: diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index f295ccd7a555..fab01bd40423 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -239,7 +239,7 @@ class VMFunctionCompiler : ExprFunctor { DLOG(INFO) << "VMCompiler::Emit: instr=" << instr; CHECK((int)instr.op < 100) << "Invalid opcode " << (int)instr.op; switch (instr.op) { - case Opcode::AllocDatatype: + case Opcode::AllocADT: case Opcode::AllocTensor: case Opcode::AllocTensorReg: case Opcode::GetField: @@ -287,7 +287,7 @@ class VMFunctionCompiler : ExprFunctor { } // TODO(@jroesch): use correct tag - Emit(Instruction::AllocDatatype( + Emit(Instruction::AllocADT( 0, tuple->fields.size(), fields_registers, @@ -626,7 +626,7 @@ class VMFunctionCompiler : ExprFunctor { for (size_t i = arity - return_count; i < arity; ++i) { fields_registers.push_back(unpacked_arg_regs[i]); } - Emit(Instruction::AllocDatatype(0, return_count, fields_registers, NewRegister())); + Emit(Instruction::AllocADT(0, return_count, fields_registers, NewRegister())); } } @@ -659,7 +659,7 @@ class VMFunctionCompiler : ExprFunctor { } } else if (auto constructor_node = op.as()) { auto constructor = GetRef(constructor_node); - Emit(Instruction::AllocDatatype(constructor->tag, call_node->args.size(), args_registers, + Emit(Instruction::AllocADT(constructor->tag, call_node->args.size(), args_registers, NewRegister())); } else if (auto var_node = op.as()) { VisitExpr(GetRef(var_node)); diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index f85283094e91..32032b5a1e64 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -315,7 +315,7 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { fields.push_back(instr.dst); break; } - case Opcode::AllocDatatype: { + case Opcode::AllocADT: { // Number of fields = 3 + instr.num_fields fields.assign({instr.constructor_tag, instr.num_fields, instr.dst}); @@ -551,7 +551,7 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { return Instruction::AllocTensorReg(shape_register, dtype, dst); } - case Opcode::AllocDatatype: { + case Opcode::AllocADT: { // Number of fields = 3 + instr.num_fields DCHECK_GE(instr.fields.size(), 3U); DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); @@ -561,7 +561,7 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { RegName dst = instr.fields[2]; std::vector fields = ExtractFields(instr.fields, 3, num_fields); - return Instruction::AllocDatatype(constructor_tag, num_fields, fields, dst); + return Instruction::AllocADT(constructor_tag, num_fields, fields, dst); } case Opcode::AllocClosure: { // Number of fields = 3 + instr.num_freevar diff --git a/src/runtime/vm/object.cc b/src/runtime/vm/object.cc index c20a1ce9de27..12edf511db66 100644 --- a/src/runtime/vm/object.cc +++ b/src/runtime/vm/object.cc @@ -39,15 +39,15 @@ Tensor::Tensor(NDArray data) { data_ = std::move(ptr); } -Datatype::Datatype(size_t tag, std::vector fields) { - auto ptr = make_object(); +ADT::ADT(size_t tag, std::vector fields) { + auto ptr = make_object(); ptr->tag = tag; ptr->fields = std::move(fields); data_ = std::move(ptr); } -Datatype Datatype::Tuple(std::vector fields) { - return Datatype(0, fields); +ADT ADT::Tuple(std::vector fields) { + return ADT(0, fields); } Closure::Closure(size_t func_index, std::vector free_vars) { @@ -66,28 +66,28 @@ TVM_REGISTER_GLOBAL("_vmobj.GetTensorData") *rv = cell->data; }); 
-TVM_REGISTER_GLOBAL("_vmobj.GetDatatypeTag") +TVM_REGISTER_GLOBAL("_vmobj.GetADTTag") .set_body([](TVMArgs args, TVMRetValue* rv) { ObjectRef obj = args[0]; - const auto* cell = obj.as(); + const auto* cell = obj.as(); CHECK(cell != nullptr); *rv = static_cast(cell->tag); }); -TVM_REGISTER_GLOBAL("_vmobj.GetDatatypeNumberOfFields") +TVM_REGISTER_GLOBAL("_vmobj.GetADTNumberOfFields") .set_body([](TVMArgs args, TVMRetValue* rv) { ObjectRef obj = args[0]; - const auto* cell = obj.as(); + const auto* cell = obj.as(); CHECK(cell != nullptr); *rv = static_cast(cell->fields.size()); }); -TVM_REGISTER_GLOBAL("_vmobj.GetDatatypeFields") +TVM_REGISTER_GLOBAL("_vmobj.GetADTFields") .set_body([](TVMArgs args, TVMRetValue* rv) { ObjectRef obj = args[0]; int idx = args[1]; - const auto* cell = obj.as(); + const auto* cell = obj.as(); CHECK(cell != nullptr); CHECK_LT(idx, cell->fields.size()); *rv = cell->fields[idx]; @@ -104,10 +104,10 @@ TVM_REGISTER_GLOBAL("_vmobj.Tuple") for (auto i = 0; i < args.size(); ++i) { fields.push_back(args[i]); } - *rv = Datatype::Tuple(fields); + *rv = ADT::Tuple(fields); }); -TVM_REGISTER_GLOBAL("_vmobj.Datatype") +TVM_REGISTER_GLOBAL("_vmobj.ADT") .set_body([](TVMArgs args, TVMRetValue* rv) { int itag = args[0]; size_t tag = static_cast(itag); @@ -115,11 +115,11 @@ TVM_REGISTER_GLOBAL("_vmobj.Datatype") for (int i = 1; i < args.size(); i++) { fields.push_back(args[i]); } - *rv = Datatype(tag, fields); + *rv = ADT(tag, fields); }); TVM_REGISTER_OBJECT_TYPE(TensorObj); -TVM_REGISTER_OBJECT_TYPE(DatatypeObj); +TVM_REGISTER_OBJECT_TYPE(ADTObj); TVM_REGISTER_OBJECT_TYPE(ClosureObj); } // namespace vm } // namespace runtime diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 78b74768b930..fd5ff64d5812 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -74,7 +74,7 @@ Instruction::Instruction(const Instruction& instr) { this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register; this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype; return; - case Opcode::AllocDatatype: + case Opcode::AllocADT: this->constructor_tag = instr.constructor_tag; this->num_fields = instr.num_fields; this->datatype_fields = Duplicate(instr.datatype_fields, instr.num_fields); @@ -159,7 +159,7 @@ Instruction& Instruction::operator=(const Instruction& instr) { this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register; this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype; return *this; - case Opcode::AllocDatatype: + case Opcode::AllocADT: this->constructor_tag = instr.constructor_tag; this->num_fields = instr.num_fields; FreeIf(this->datatype_fields); @@ -229,7 +229,7 @@ Instruction::~Instruction() { case Opcode::AllocTensor: delete this->alloc_tensor.shape; return; - case Opcode::AllocDatatype: + case Opcode::AllocADT: delete this->datatype_fields; return; case Opcode::AllocClosure: @@ -301,10 +301,10 @@ Instruction Instruction::AllocTensorReg(RegName shape_register, DLDataType dtype return instr; } -Instruction Instruction::AllocDatatype(Index tag, Index num_fields, +Instruction Instruction::AllocADT(Index tag, Index num_fields, const std::vector& datatype_fields, Index dst) { Instruction instr; - instr.op = Opcode::AllocDatatype; + instr.op = Opcode::AllocADT; instr.dst = dst; instr.constructor_tag = tag; instr.num_fields = num_fields; @@ -485,7 +485,7 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { DLDatatypePrint(os, instr.alloc_tensor_reg.dtype); break; } - case Opcode::AllocDatatype: { + case 
Opcode::AllocADT: { os << "alloc_data $" << instr.dst << " tag(" << instr.constructor_tag << ") [$" << StrJoin(instr.datatype_fields, 0, instr.num_fields, ",$") << "]"; break; @@ -691,7 +691,7 @@ void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, const std::vector& args) { size_t arity = 0; for (Index i = 0; i < arg_count; i++) { - if (const auto* obj = args[i].as()) { + if (const auto* obj = args[i].as()) { arity += obj->fields.size(); } else { ++arity; @@ -703,7 +703,7 @@ void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, runtime::TVMArgsSetter setter(values.data(), codes.data()); int idx = 0; for (Index i = 0; i < arg_count; i++) { - if (const auto* dt_cell = args[i].as()) { + if (const auto* dt_cell = args[i].as()) { for (auto obj : dt_cell->fields) { const auto* tensor = obj.as(); CHECK(tensor != nullptr); @@ -849,7 +849,7 @@ void VirtualMachine::RunLoop() { } case Opcode::GetField: { auto object = ReadRegister(instr.object); - const auto* tuple = object.as(); + const auto* tuple = object.as(); CHECK(tuple != nullptr) << "Object is not data type object, register " << instr.object << ", Object tag " << object->type_index(); @@ -860,7 +860,7 @@ void VirtualMachine::RunLoop() { } case Opcode::GetTag: { auto object = ReadRegister(instr.get_tag.object); - const auto* data = object.as(); + const auto* data = object.as(); CHECK(data != nullptr) << "Object is not data type object, register " << instr.get_tag.object << ", Object tag " @@ -925,12 +925,12 @@ void VirtualMachine::RunLoop() { pc++; goto main_loop; } - case Opcode::AllocDatatype: { + case Opcode::AllocADT: { std::vector fields; for (Index i = 0; i < instr.num_fields; ++i) { fields.push_back(ReadRegister(instr.datatype_fields[i])); } - ObjectRef obj = Datatype(instr.constructor_tag, fields); + ObjectRef obj = ADT(instr.constructor_tag, fields); WriteRegister(instr.dst, obj); pc++; goto main_loop; diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 3321d71a2cb8..420bcb72a4a2 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -49,7 +49,7 @@ def convert_to_list(x): def vmobj_to_list(o): if isinstance(o, tvm.relay.backend.vmobj.Tensor): return [o.asnumpy().tolist()] - elif isinstance(o, tvm.relay.backend.vmobj.Datatype): + elif isinstance(o, tvm.relay.backend.vmobj.ADT): result = [] for f in o: result.extend(vmobj_to_list(f)) diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 390d3cd9f3c4..32bc22f9031a 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -742,7 +742,7 @@ def vmobj_to_list(o): return [o.asnumpy().tolist()] elif isinstance(o, tvm.relay.backend.interpreter.TensorValue): return [o.asnumpy()] - elif isinstance(o, tvm.relay.backend.vmobj.Datatype): + elif isinstance(o, tvm.relay.backend.vmobj.ADT): result = [] for f in o: result.extend(vmobj_to_list(f)) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 1b40f894db08..a3b251c38e00 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -63,7 +63,7 @@ def veval(f, *args, ctx=tvm.cpu(), target="llvm"): def vmobj_to_list(o): if isinstance(o, tvm.relay.backend.vm.Tensor): return [o.asnumpy().tolist()] - elif isinstance(o, tvm.relay.backend.vm.Datatype): + elif isinstance(o, tvm.relay.backend.vm.ADT): result = [] for f in o: result.extend(vmobj_to_list(f)) diff --git 
a/tests/python/relay/test_vm_object.py b/tests/python/relay/test_vm_object.py index ad21fff8e185..12d263d1125b 100644 --- a/tests/python/relay/test_vm_object.py +++ b/tests/python/relay/test_vm_object.py @@ -28,13 +28,13 @@ def test_tensor(): assert isinstance(x.data, tvm.nd.NDArray) -def test_datatype(): +def test_adt(): arr = tvm.nd.array([1,2,3]) x = vm.Tensor(arr) - y = vm.Datatype(0, [x, x]) + y = vm.ADT(0, [x, x]) assert len(y) == 2 - assert isinstance(y, vm.Datatype) + assert isinstance(y, vm.ADT) y[0:1][-1].data == x.data assert y.tag == 0 assert isinstance(x.data, tvm.nd.NDArray) @@ -43,4 +43,4 @@ def test_datatype(): if __name__ == "__main__": test_tensor() - test_datatype() + test_adt() From ffc11b758d9e786a18a47a8715356d0cad1a24d9 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 20 Oct 2019 10:40:10 -0700 Subject: [PATCH 09/59] [Runtime] Enable option to use OpenMP thread pool (#4089) --- CMakeLists.txt | 4 ++++ cmake/config.cmake | 4 ++++ cmake/modules/OpenMP.cmake | 48 ++++++++++++++++++++++++++++++++++++++ src/runtime/thread_pool.cc | 26 +++++++++++++++++++++ 4 files changed, 82 insertions(+) create mode 100644 cmake/modules/OpenMP.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f44dd502e5de..248b39130e36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) +tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF) tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF) tvm_option(USE_SGX "Build with SGX" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -155,6 +156,7 @@ list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS}) + if(USE_VM_PROFILER) message(STATUS "Build compiler with Relay VM profiler support...") file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc) @@ -234,6 +236,7 @@ include(cmake/modules/VTA.cmake) include(cmake/modules/CUDA.cmake) include(cmake/modules/OpenCL.cmake) include(cmake/modules/OpenGL.cmake) +include(cmake/modules/OpenMP.cmake) include(cmake/modules/Vulkan.cmake) include(cmake/modules/Metal.cmake) include(cmake/modules/ROCM.cmake) @@ -267,6 +270,7 @@ add_library(tvm_topi SHARED ${TOPI_SRCS}) add_library(tvm_runtime SHARED ${RUNTIME_SRCS}) add_library(tvm_runtime_static STATIC ${RUNTIME_SRCS}) + if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") diff --git a/cmake/config.cmake b/cmake/config.cmake index b88d25b68700..f87dc8ab1d8f 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -115,6 +115,10 @@ set(USE_BLAS none) # set(USE_MKL_PATH ) if using `pip install mkl` set(USE_MKL_PATH none) +# Whether use OpenMP thread pool, choices: gnu, intel +# Note: "gnu" uses gomp library, "intel" uses iomp5 library +set(USE_OPENMP none) + # Whether use contrib.random in runtime set(USE_RANDOM OFF) diff --git a/cmake/modules/OpenMP.cmake b/cmake/modules/OpenMP.cmake new file mode 100644 index 000000000000..5dd9be508342 --- /dev/null +++ b/cmake/modules/OpenMP.cmake @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# OpenMP Module +if(USE_OPENMP STREQUAL "gnu") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenMP_CXX_LIBRARIES}) + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=1) + message(STATUS "Build with OpenMP ${OpenMP_CXX_LIBRARIES}") + else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) + message(WARNING "OpenMP cannot be found, use TVM threadpool instead.") + endif() +elseif(USE_OPENMP STREQUAL "intel") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + if (MSVC) + find_library(OMP_LIBRARY NAMES libiomp5md) + else() + find_library(OMP_LIBRARY NAMES iomp5) + endif() + list(APPEND TVM_RUNTIME_LINKER_LIBS ${OMP_LIBRARY}) + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=1) + message(STATUS "Build with OpenMP " ${OMP_LIBRARY}) + else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) + message(WARNING "OpenMP cannot be found, use TVM threadpool instead.") + endif() +else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) +endif() diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 2e101364db2a..e9e6d03243e3 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -29,6 +29,9 @@ #include #include #include +#if TVM_THREADPOOL_USE_OPENMP +#include +#endif #include #include #include @@ -394,12 +397,34 @@ int TVMBackendParallelLaunch( FTVMParallelLambda flambda, void* cdata, int num_task) { +#if !TVM_THREADPOOL_USE_OPENMP int res = tvm::runtime::ThreadPool::ThreadLocal()->Launch( flambda, cdata, num_task, 1); return res; +#else + int num_workers = tvm::runtime::threading::MaxConcurrency(); + if (num_task == 0) num_task = num_workers; + omp_set_num_threads(num_workers); + #pragma omp parallel num_threads(num_workers) + { + TVMParallelGroupEnv env; + env.num_task = num_task; + std::atomic* sync_counter = new std::atomic[num_task * tvm::runtime::kSyncStride]; + for (int i = 0; i < num_task; ++i) { + sync_counter[i * tvm::runtime::kSyncStride].store( + 0, std::memory_order_relaxed); + } + env.sync_handle = sync_counter; + (*flambda)(omp_get_thread_num(), &env, cdata); + } + return 0; +#endif } int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { +#if TVM_THREADPOOL_USE_OPENMP + #pragma omp barrier +#else using tvm::runtime::kSyncStride; int num_task = penv->num_task; std::atomic* sync_counter = @@ -415,5 +440,6 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { } } std::atomic_thread_fence(std::memory_order_acquire); +#endif return 0; } From 824e1d8182b97c399a975d0583e78b021d03f0a6 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 20 Oct 2019 18:30:41 -0700 Subject: [PATCH 10/59] [REFACTOR][NODE][RUNTIME] Move Node to the new Object protocol. 
(#4161) * [REFACTOR][NODE][RUNTIME] Move Node to the new Object protocol. This PR removes the original node system, and makes Node a subclass of Object. This is a major refactor towards a better unified runtime object system. List of changes in the refactor: - We now hide data_ field, use Downcast explicitly to get a sub-class object. - Removed the node system FFI in python. - Removed the node C API, instead use PackedFunc for list and get attrs. - Change relay::Op::set_attr_type_key(attr_key_name) to relay::Op::set_attr_type(). - This change was necessary because of the new Object registration mechanism. - Subsequent changes to the op registrations. - The change revealed a few previous problems that are now fixed. - Patched up a few missing node type registrations. - Now we will raise an error if we register an object that is not registered. - The original node.h and container.h are kept in the same location. - Calling convention: kObjectHandle now equals the old kNodeHandle, kNodeHandle is removed. - IRFunctor now dispatches on ObjectRef. - Update to the new type checking API: is_type, derived_from are replaced by IsInstance. - Removed .hash member function, instead use C++ convention hasher functors. * Address review comments --- golang/src/value.go | 4 +- include/tvm/api_registry.h | 8 +- include/tvm/arithmetic.h | 4 +- include/tvm/attrs.h | 24 +- include/tvm/base.h | 16 +- include/tvm/buffer.h | 4 +- include/tvm/build_module.h | 16 +- include/tvm/c_dsl_api.h | 98 ------ include/tvm/channel.h | 4 +- include/tvm/data_layout.h | 8 +- include/tvm/expr.h | 27 +- include/tvm/ir.h | 6 +- include/tvm/ir_functor_ext.h | 18 +- include/tvm/ir_mutator.h | 4 +- include/tvm/ir_visitor.h | 4 +- include/tvm/lowered_func.h | 9 +- include/tvm/node/container.h | 219 ++++++------- include/tvm/node/ir_functor.h | 50 +-- include/tvm/node/memory.h | 77 ----- include/tvm/node/node.h | 300 +++--------------- include/tvm/operation.h | 2 +- include/tvm/packed_func_ext.h | 183 +++++------ include/tvm/relay/adt.h | 2 +- include/tvm/relay/base.h | 10 +- include/tvm/relay/expr.h | 11 +- include/tvm/relay/expr_functor.h | 8 +- include/tvm/relay/interpreter.h | 4 +- include/tvm/relay/module.h | 6 +- include/tvm/relay/op.h | 19 +- include/tvm/relay/pattern_functor.h | 8 +- include/tvm/relay/transform.h | 8 +- include/tvm/relay/type.h | 7 +- include/tvm/runtime/c_runtime_api.h | 3 +- include/tvm/runtime/memory.h | 2 +- include/tvm/runtime/node_base.h | 259 --------------- include/tvm/runtime/object.h | 248 +++++++++++++-- include/tvm/runtime/packed_func.h | 65 ++-- include/tvm/schedule.h | 20 +- include/tvm/tensor.h | 16 +- include/tvm/tensor_intrin.h | 4 +- .../main/native/ml_dmlc_tvm_native_c_api.cc | 6 +- nnvm/include/nnvm/compiler/util.h | 6 +- nnvm/src/compiler/compile_engine.cc | 7 +- nnvm/src/compiler/compile_engine.h | 6 +- nnvm/src/compiler/graph_runtime.h | 5 +- nnvm/src/compiler/packed_func_ext.cc | 6 +- nnvm/src/top/tensor/transform.cc | 6 +- python/tvm/_ffi/_ctypes/function.py | 17 +- python/tvm/_ffi/_ctypes/node.py | 102 ------ python/tvm/_ffi/_ctypes/object.py | 13 +- python/tvm/_ffi/_cython/base.pxi | 17 +- python/tvm/_ffi/_cython/core.pyx | 2 +- python/tvm/_ffi/_cython/function.pxi | 23 +- python/tvm/_ffi/_cython/node.pxi | 110 ------- python/tvm/_ffi/_cython/object.pxi | 12 +- python/tvm/_ffi/node.py | 59 +--- python/tvm/_ffi/object.py | 23 +- python/tvm/_ffi/runtime_ctypes.py | 3 +- python/tvm/error.py | 1 + python/tvm/relay/backend/profiler_vm.py | 4 + python/tvm/relay/debug.py | 4 -
rust/common/src/packed_func.rs | 6 +- rust/frontend/src/function.rs | 2 +- src/api/api_arith.cc | 3 +- src/api/api_base.cc | 11 +- src/api/api_codegen.cc | 6 +- src/api/api_ir.cc | 1 - src/api/api_lang.cc | 93 +++--- src/api/api_pass.cc | 8 +- src/api/api_schedule.cc | 5 +- src/api/dsl_api.cc | 134 +++----- src/arithmetic/analyzer.cc | 7 +- src/arithmetic/canonical_simplify.cc | 6 +- src/arithmetic/const_int_bound.cc | 2 +- src/arithmetic/detect_linear_equation.cc | 2 +- src/arithmetic/int_set.cc | 4 +- src/arithmetic/ir_mutator_with_analyzer.cc | 2 +- src/arithmetic/ir_visitor_with_analyzer.h | 2 +- src/arithmetic/modular_set.cc | 2 +- src/codegen/build_module.cc | 24 +- src/codegen/codegen_c.cc | 2 +- src/codegen/llvm/codegen_llvm.cc | 2 +- src/codegen/spirv/codegen_spirv.cc | 2 +- src/contrib/hybrid/codegen_hybrid.cc | 4 +- src/contrib/hybrid/codegen_hybrid.h | 1 - src/lang/attr_functor.h | 80 ++--- src/lang/attrs.cc | 52 +-- src/lang/data_layout.cc | 8 +- src/lang/expr.cc | 4 +- src/lang/ir.cc | 8 +- src/lang/reflection.cc | 105 +++--- src/node/node.cc | 76 ----- src/op/compute_op.cc | 8 +- src/op/hybrid_op.cc | 4 +- src/op/op_util.cc | 2 +- src/op/tensorize.cc | 2 +- src/pass/arg_binder.cc | 2 +- src/pass/coproc_sync.cc | 6 +- src/pass/hoist_if_then_else.cc | 7 +- src/pass/inject_copy_intrin.cc | 10 +- src/pass/inject_double_buffer.cc | 2 +- src/pass/inject_prefetch.cc | 2 +- src/pass/inject_virtual_thread.cc | 5 +- src/pass/ir_mutator.cc | 2 +- src/pass/lift_attr_scope.cc | 6 +- src/pass/lower_thread_allreduce.cc | 6 +- src/pass/lower_warp_memory.cc | 6 +- src/pass/make_api.cc | 4 +- src/pass/narrow_channel_access.cc | 2 +- src/pass/remap_thread_axis.cc | 6 +- src/pass/split_host_device.cc | 10 +- src/pass/split_pipeline.cc | 8 +- src/pass/storage_access.cc | 12 +- src/pass/storage_flatten.cc | 16 +- src/pass/storage_rewrite.cc | 13 +- src/pass/storage_sync.cc | 6 +- src/pass/unroll_loop.cc | 3 +- src/pass/vectorize_loop.cc | 3 +- src/pass/verify_memory.cc | 6 +- src/relay/backend/compile_engine.cc | 8 +- src/relay/backend/compile_engine.h | 18 +- src/relay/backend/graph_runtime_codegen.cc | 6 +- src/relay/ir/alpha_equal.cc | 15 +- src/relay/ir/expr_functor.cc | 7 +- src/relay/ir/hash.cc | 21 +- src/relay/ir/module.cc | 9 +- src/relay/ir/op.cc | 12 +- src/relay/ir/pretty_printer.cc | 18 +- src/relay/ir/type_functor.h | 14 +- src/relay/op/algorithm/argsort.cc | 2 +- src/relay/op/algorithm/topk.cc | 2 +- src/relay/op/debug.cc | 13 +- src/relay/op/image/resize.cc | 6 +- src/relay/op/nn/bitserial.cc | 38 +-- src/relay/op/nn/convolution.cc | 20 +- src/relay/op/nn/nn.cc | 26 +- src/relay/op/nn/pad.cc | 4 +- src/relay/op/nn/pooling.cc | 16 +- src/relay/op/nn/sparse.cc | 4 +- src/relay/op/nn/upsampling.cc | 2 +- src/relay/op/tensor/reduce.cc | 18 +- src/relay/op/tensor/transform.cc | 75 +++-- src/relay/op/tensor/unary.cc | 10 +- src/relay/op/vision/multibox_op.cc | 8 +- src/relay/op/vision/yolo.cc | 2 +- src/relay/pass/alter_op_layout.cc | 15 +- src/relay/pass/device_annotation.cc | 10 +- src/relay/pass/eta_expand.cc | 4 +- src/relay/pass/fold_constant.cc | 2 +- src/relay/pass/fold_scale_axis.cc | 10 +- src/relay/pass/partial_eval.cc | 14 +- src/relay/pass/pass_manager.cc | 7 +- src/relay/pass/quantize/annotate.cc | 4 +- src/relay/pass/quantize/partition.cc | 3 + src/relay/pass/quantize/quantize.cc | 2 +- src/relay/pass/quantize/quantize.h | 8 +- src/relay/pass/quantize/realize.cc | 22 +- src/relay/pass/type_infer.cc | 21 +- src/relay/pass/type_solver.cc | 2 +- 
src/relay/qnn/op/concatenate.cc | 2 +- src/relay/qnn/op/convolution.cc | 2 +- src/relay/qnn/op/dense.cc | 2 +- src/relay/qnn/op/dequantize.cc | 2 +- src/relay/qnn/op/quantize.cc | 2 +- src/relay/qnn/op/requantize.cc | 2 +- src/runtime/c_dsl_api.cc | 91 ------ src/runtime/c_runtime_api.cc | 2 +- src/runtime/dsl_api.h | 59 ---- src/runtime/object.cc | 21 +- src/schedule/graph.cc | 2 +- src/schedule/schedule_dataflow_rewrite.cc | 18 +- src/schedule/schedule_lang.cc | 24 +- src/schedule/schedule_ops.cc | 6 +- tests/cpp/expr_test.cc | 4 +- tests/cpp/ir_functor_test.cc | 2 +- tests/cpp/object_protocol_test.cc | 6 +- tests/cpp/packed_func_test.cc | 2 +- tests/python/unittest/test_lang_schedule.py | 6 +- .../unittest/test_runtime_vm_profiler.py | 2 + topi/include/topi/cuda/pooling.h | 2 +- topi/include/topi/cuda/reduction.h | 2 +- topi/include/topi/detail/constant_utils.h | 15 +- topi/include/topi/generic/extern.h | 2 +- topi/src/topi.cc | 5 +- web/tvm_runtime.js | 8 +- 185 files changed, 1442 insertions(+), 2387 deletions(-) delete mode 100644 include/tvm/c_dsl_api.h delete mode 100644 include/tvm/node/memory.h delete mode 100644 include/tvm/runtime/node_base.h delete mode 100644 python/tvm/_ffi/_ctypes/node.py delete mode 100644 python/tvm/_ffi/_cython/node.pxi delete mode 100644 src/node/node.cc delete mode 100644 src/runtime/c_dsl_api.cc delete mode 100644 src/runtime/dsl_api.h diff --git a/golang/src/value.go b/golang/src/value.go index 576331a8cfa0..5e0f78270eaa 100644 --- a/golang/src/value.go +++ b/golang/src/value.go @@ -44,8 +44,8 @@ var KTVMType = int32(C.kTVMType) var KTVMContext = int32(C.kTVMContext) // KArrayHandle is golang type code for TVM kArrayHandle. var KArrayHandle = int32(C.kArrayHandle) -// KNodeHandle is golang type code for TVM kNodeHandle. -var KNodeHandle = int32(C.kNodeHandle) +// KObjectHandle is golang type code for TVM kObjectHandle. +var KObjectHandle = int32(C.kObjectHandle) // KModuleHandle is gonag type code for TVM kModuleHandle. var KModuleHandle = int32(C.kModuleHandle) // KFuncHandle is gonalg type code for TVM kFuncHandle. diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h index e12d841519ca..dbd097293593 100644 --- a/include/tvm/api_registry.h +++ b/include/tvm/api_registry.h @@ -79,7 +79,7 @@ class EnvFunc : public NodeRef { explicit EnvFunc(NodePtr n) : NodeRef(n) {} /*! \return The internal global function pointer */ const EnvFuncNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief Invoke the function. @@ -124,19 +124,19 @@ class TypedEnvFunc : public NodeRef { /*! \brief short hand for this function type */ using TSelf = TypedEnvFunc; TypedEnvFunc() {} - explicit TypedEnvFunc(NodePtr n) : NodeRef(n) {} + explicit TypedEnvFunc(ObjectPtr n) : NodeRef(n) {} /*! * \brief Assign global function to a TypedEnvFunc * \param other Another global function. * \return reference to self. */ TSelf& operator=(const EnvFunc& other) { - this->node_ = other.node_; + ObjectRef::operator=(other); return *this; } /*! \return The internal global function pointer */ const EnvFuncNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief Invoke the function. diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index 8be1c3604813..e81fa0afd254 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -362,7 +362,7 @@ class IntSet : public NodeRef { /*! \brief constructor */ IntSet() {} // constructor from not container. 
- explicit IntSet(NodePtr n) : NodeRef(n) {} + explicit IntSet(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -692,7 +692,7 @@ Array DetectClipBound(const Expr& e, // implementation inline const IntSetNode* IntSet::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace arith } // namespace tvm diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h index 3b64d1f961e2..fb8927a75613 100644 --- a/include/tvm/attrs.h +++ b/include/tvm/attrs.h @@ -163,7 +163,7 @@ class AttrsEqual { return lhs == rhs; } // node comparator - TVM_DLL bool operator()(const NodeRef& lhs, const NodeRef& rhs) const; + TVM_DLL bool operator()(const ObjectRef& lhs, const ObjectRef& rhs) const; protected: friend class AttrsEqualHandler; @@ -203,7 +203,7 @@ class AttrsHash { (static_cast(value.bits()) << 8) | (static_cast(value.lanes()) << 16)); } - TVM_DLL size_t operator()(const NodeRef& value) const; + TVM_DLL size_t operator()(const ObjectRef& value) const; private: friend class AttrsHashHandler; @@ -260,7 +260,7 @@ class BaseAttrsNode : public Node { * \return The comparison result. */ TVM_DLL virtual bool ContentEqual( - const Node* other, AttrsEqual equal) const = 0; + const Object* other, AttrsEqual equal) const = 0; /*! * \brief Content aware hash. * \param hasher The hasher to run the hash. @@ -290,7 +290,7 @@ class Attrs : public NodeRef { private: /*! \return the internal attribute node */ const BaseAttrsNode* ptr() const { - return static_cast(node_.get()); + return static_cast(get()); } }; @@ -315,7 +315,7 @@ class DictAttrsNode : public BaseAttrsNode { void VisitNonDefaultAttrs(AttrVisitor* v) final; void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final; Array ListFieldInfo() const final; - bool ContentEqual(const Node* other, AttrsEqual equal) const final; + bool ContentEqual(const Object* other, AttrsEqual equal) const final; size_t ContentHash(AttrsHash hasher) const final; // type info static constexpr const char* _type_key = "DictAttrs"; @@ -369,7 +369,7 @@ class AttrsEqualVisitor { public: bool result_{true}; // constructor - AttrsEqualVisitor(const Node* lhs, const Node* rhs, const AttrsEqual& equal) + AttrsEqualVisitor(const Object* lhs, const Object* rhs, const AttrsEqual& equal) : lhs_(lhs), rhs_(rhs), equal_(equal) { } template @@ -387,8 +387,8 @@ class AttrsEqualVisitor { } private: - const Node* lhs_; - const Node* rhs_; + const Object* lhs_; + const Object* rhs_; const AttrsEqual& equal_; }; @@ -488,7 +488,7 @@ inline void SetIntValue(T* ptr, const TVMArgValue& val) { } else if (const ir::UIntImm* op = expr.as()) { *ptr = static_cast(op->value); } else { - LOG(FATAL) << "Expect int value, but get " << expr->type_key(); + LOG(FATAL) << "Expect int value, but get " << expr->GetTypeKey(); } } } @@ -521,7 +521,7 @@ inline void SetValue(double* ptr, const TVMArgValue& val) { } else if (const ir::UIntImm* op = expr.as()) { *ptr = static_cast(op->value); } else { - LOG(FATAL) << "Expect float value, but get " << expr->type_key(); + LOG(FATAL) << "Expect float value, but get " << expr->GetTypeKey(); } } } @@ -827,7 +827,7 @@ class AttrsNode : public BaseAttrsNode { return visitor.fields_; } - bool ContentEqual(const Node* other, AttrsEqual equal) const final { + bool ContentEqual(const Object* other, AttrsEqual equal) const final { DerivedType* pself = self(); if (pself == other) return true; if (other == nullptr) return false; @@ -839,7 +839,7 @@ 
class AttrsNode : public BaseAttrsNode { size_t ContentHash(AttrsHash hasher) const final { ::tvm::detail::AttrsHashVisitor visitor(hasher); - visitor.result_ = std::hash()(this->type_key()); + visitor.result_ = this->GetTypeKeyHash(); self()->__VisitAttrs__(visitor); return visitor.result_; } diff --git a/include/tvm/base.h b/include/tvm/base.h index f358f7f5d447..a42de10abef2 100644 --- a/include/tvm/base.h +++ b/include/tvm/base.h @@ -47,9 +47,10 @@ using ::tvm::AttrVisitor; */ #define TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseTypeName, NodeName) \ TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : BaseTypeName(n) {} \ + explicit TypeName(::tvm::ObjectPtr<::tvm::Object> n) \ + : BaseTypeName(n) {} \ const NodeName* operator->() const { \ - return static_cast(node_.get()); \ + return static_cast(data_.get()); \ } \ operator bool() const { return this->defined(); } \ using ContainerType = NodeName; @@ -75,12 +76,12 @@ using ::tvm::AttrVisitor; */ #define TVM_DEFINE_NODE_REF_COW(NodeName) \ NodeName* CopyOnWrite() { \ - CHECK(node_ != nullptr); \ - if (!node_.unique()) { \ + CHECK(data_ != nullptr); \ + if (!data_.unique()) { \ NodePtr n = make_node(*(operator->())); \ - NodePtr(std::move(n)).swap(node_); \ + ObjectPtr(std::move(n)).swap(data_); \ } \ - return static_cast(node_.get()); \ + return static_cast(data_.get()); \ } /*! \brief Macro to make it easy to define node ref type given node */ @@ -160,7 +161,7 @@ std::string SaveJSON(const NodeRef& node); * * \return The shared_ptr of the Node. */ -NodePtr LoadJSON_(std::string json_str); +ObjectPtr LoadJSON_(std::string json_str); /*! * \brief Load the node from json string. @@ -233,6 +234,7 @@ struct NodeFactoryReg { * \note This is necessary to enable serialization of the Node. */ #define TVM_REGISTER_NODE_TYPE(TypeName) \ + TVM_REGISTER_OBJECT_TYPE(TypeName); \ static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \ ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \ .set_creator([](const std::string&) { return ::tvm::make_node(); }) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index 1233e9b0b89b..f18ed9206db3 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -51,7 +51,7 @@ enum BufferType : int { class Buffer : public NodeRef { public: Buffer() {} - explicit Buffer(NodePtr n) : NodeRef(n) {} + explicit Buffer(ObjectPtr n) : NodeRef(n) {} /*! * \brief Return a new buffer that is equivalent with current one * but always add stride field. @@ -171,7 +171,7 @@ class BufferNode : public Node { }; inline const BufferNode* Buffer::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 1d57d82e66c6..c985fbe17546 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -93,7 +93,7 @@ class TargetNode : public Node { class Target : public NodeRef { public: Target() {} - explicit Target(NodePtr n) : NodeRef(n) {} + explicit Target(ObjectPtr n) : NodeRef(n) {} /*! 
* \brief Create a Target given a string * \param target_str the string to parse @@ -110,7 +110,7 @@ class Target : public NodeRef { TVM_DLL static tvm::Target Current(bool allow_not_defined = true); const TargetNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } using ContainerType = TargetNode; @@ -256,12 +256,12 @@ class BuildConfigNode : public Node { class BuildConfig : public ::tvm::NodeRef { public: BuildConfig() {} - explicit BuildConfig(NodePtr<::tvm::Node> n) : NodeRef(n) {} + explicit BuildConfig(ObjectPtr n) : NodeRef(n) {} const BuildConfigNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } BuildConfigNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } /*! * \brief Construct a BuildConfig containing a empty build config node. @@ -371,7 +371,7 @@ class GenericFuncNode; class GenericFunc : public NodeRef { public: GenericFunc() {} - explicit GenericFunc(NodePtr n) : NodeRef(n) {} + explicit GenericFunc(ObjectPtr n) : NodeRef(n) {} /*! * \brief Set the default function implementaiton. @@ -478,10 +478,10 @@ class GenericFuncNode : public Node { }; inline GenericFuncNode* GenericFunc::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } -#define TVM_GENERIC_FUNC_REG_VAR_DEF \ +#define TVM_GENERIC_FUNC_REG_VAR_DEF \ static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_ ## TVM /*! diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h deleted file mode 100644 index bbbb84926e8e..000000000000 --- a/include/tvm/c_dsl_api.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/c_dsl_api.h - * - * \brief TVM DSL Node C API, used to interact to DSL compilation. - * - * These are only a few functions needed for DSL construction time. - * These function are only available when link libtvm. - * If only TVM runtime is linked, calling these function will trigger error. - * - * \note Most API functions are registerd as PackedFunc and - * can be grabbed via TVMFuncGetGlobal - */ -#ifndef TVM_C_DSL_API_H_ -#define TVM_C_DSL_API_H_ - -#include "runtime/c_runtime_api.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/*! \brief handle to node */ -typedef void* NodeHandle; - -/*! - * \brief free the node handle - * \param handle The node handle to be freed. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeFree(NodeHandle handle); - -/*! - * \brief Convert type key to type index. - * \param type_key The key of the type. - * \param out_index the corresponding type index. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeTypeKey2Index(const char* type_key, - int* out_index); - -/*! 
- * \brief Get runtime type index of the node. - * \param handle the node handle. - * \param out_index the corresponding type index. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeGetTypeIndex(NodeHandle handle, - int* out_index); - -/*! - * \brief get attributes given key - * \param handle The node handle - * \param key The attribute name - * \param out_value The attribute value - * \param out_type_code The type code of the attribute. - * \param out_success Whether get is successful. - * \return 0 when success, -1 when failure happens - * \note API calls always exchanges with type bits=64, lanes=1 - */ -TVM_DLL int TVMNodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* out_value, - int* out_type_code, - int* out_success); - -/*! - * \brief get attributes names in the node. - * \param handle The node handle - * \param out_size The number of functions - * \param out_array The array of function names. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeListAttrNames(NodeHandle handle, - int *out_size, - const char*** out_array); -#ifdef __cplusplus -} // TVM_EXTERN_C -#endif -#endif // TVM_C_DSL_API_H_ diff --git a/include/tvm/channel.h b/include/tvm/channel.h index 143d4295f3e3..346291a6b06a 100644 --- a/include/tvm/channel.h +++ b/include/tvm/channel.h @@ -35,7 +35,7 @@ class Channel : public NodeRef { public: /*! \brief default constructor */ Channel() {} - explicit Channel(NodePtr n) : NodeRef(n) {} + explicit Channel(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -67,7 +67,7 @@ struct ChannelNode : public Node { // Inline implementations inline const ChannelNode* Channel::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm #endif // TVM_CHANNEL_H_ diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h index c2ae572de818..ad3da6b347af 100644 --- a/include/tvm/data_layout.h +++ b/include/tvm/data_layout.h @@ -127,7 +127,7 @@ class LayoutNode : public Node { */ class Layout : public NodeRef { public: - explicit Layout(NodePtr n) : NodeRef(n) {} + explicit Layout(ObjectPtr n) : NodeRef(n) {} /*! \brief default constructor */ Layout() = default; @@ -152,7 +152,7 @@ class Layout : public NodeRef { * \return the pointer to the internal node container */ const LayoutNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! @@ -160,7 +160,7 @@ class Layout : public NodeRef { * \return the pointer to the internal node container */ LayoutNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } /*! @@ -369,7 +369,7 @@ class BijectiveLayout : public NodeRef { }; inline const BijectiveLayoutNode* BijectiveLayout::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm diff --git a/include/tvm/expr.h b/include/tvm/expr.h index 201a2b485aa6..d884a4d61748 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -49,7 +49,7 @@ class ExprNode : public Node { class Expr : public NodeRef { public: Expr() {} - explicit Expr(NodePtr ptr) : NodeRef(ptr) {} + explicit Expr(ObjectPtr ptr) : NodeRef(ptr) {} /*! * \brief construct from integer. * \param value The value to be constructed. @@ -122,7 +122,7 @@ class Variable : public ExprNode { /*! 
\brief a named variable in TVM */ class Var : public Expr { public: - explicit Var(NodePtr n) : Expr(n) {} + explicit Var(ObjectPtr n) : Expr(n) {} TVM_DLL explicit Var(std::string name_hint = "v", Type t = Int(32)); /*! @@ -145,7 +145,7 @@ class Var : public Expr { * \return the corresponding Variable. */ const Variable* get() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } /*! \brief type indicate the container type */ using ContainerType = Variable; @@ -187,7 +187,7 @@ class Integer : public Expr { /*! * \brief constructor from node. */ - explicit Integer(NodePtr node) : Expr(node) {} + explicit Integer(ObjectPtr node) : Expr(node) {} /*! * \brief Construct integer from int value. */ @@ -197,7 +197,7 @@ class Integer : public Expr { * \param other another expression. */ Integer& operator=(const Integer& other) { - node_ = other.node_; + data_ = other.data_; return *this; } /*! @@ -205,13 +205,13 @@ class Integer : public Expr { * \return the content of the integer. */ const IntImm* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief convert to int64_t */ operator int64_t() const { - CHECK(node_ != nullptr) + CHECK(data_ != nullptr) << " Trying to reference a null Integer"; return (*this)->value; } @@ -346,7 +346,7 @@ class IterVar : public NodeRef { // construct a new iter var without a domain IterVar() {} // construct from shared ptr. - explicit IterVar(NodePtr n) : NodeRef(n) {} + explicit IterVar(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -423,7 +423,7 @@ class IterVarNode : public Node { // inline implementations inline const IterVarNode* IterVar::operator->() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } inline IterVar::operator Expr() const { @@ -481,11 +481,11 @@ class IRPrinter { : stream(stream) {} /*! \brief The node to be printed. */ - TVM_DLL void Print(const NodeRef& node); + TVM_DLL void Print(const ObjectRef& node); /*! \brief Print indent to the stream */ TVM_DLL void PrintIndent(); // Allow registration to be printer. - using FType = IRFunctor; + using FType = IRFunctor; TVM_DLL static FType& vtable(); }; @@ -498,10 +498,7 @@ inline std::ostream& operator<<(std::ostream& os, const NodeRef& n) { // NOLINT namespace std { template <> -struct hash<::tvm::IterVar> { - std::size_t operator()(const ::tvm::IterVar& k) const { - return k.hash(); - } +struct hash<::tvm::IterVar> : public ::tvm::NodeHash { }; } #endif // TVM_EXPR_H_ diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 079f05f5a7f2..b90804983cfb 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -664,10 +664,10 @@ class CommReducerNode : public Node { }; inline const CommReducerNode* CommReducer::get() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } inline const CommReducerNode* CommReducer::operator->() const { - return static_cast(node_.get()); + return get(); } /*! 
\brief Reduction operator operator */ @@ -1576,7 +1576,7 @@ namespace std { template <> struct hash<::tvm::ir::TensorKey> { std::size_t operator()(const ::tvm::ir::TensorKey& k) const { - size_t lhs = k.f.hash(); + size_t lhs = ::tvm::NodeHash()(k.f); size_t rhs = static_cast(k.value_index); lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); return lhs; diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h index a7d91eacf851..54a5eff6846b 100644 --- a/include/tvm/ir_functor_ext.h +++ b/include/tvm/ir_functor_ext.h @@ -84,19 +84,19 @@ class StmtFunctor; } #define STMT_FUNCTOR_DEFAULT { \ return VisitStmtDefault_(op, std::forward(args)...); \ -} + } #define IR_EXPR_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitExpr_(static_cast(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitExpr_(static_cast(n.get()), \ std::forward(args)...); \ }); \ #define IR_STMT_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitStmt_(static_cast(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitStmt_(static_cast(n.get()), \ std::forward(args)...); \ }); \ @@ -104,7 +104,7 @@ template class ExprFunctor { private: using TSelf = ExprFunctor; - using FType = IRFunctor; + using FType = IRFunctor; public: /*! \brief the result type of this functor */ @@ -164,7 +164,7 @@ class ExprFunctor { virtual R VisitExpr_(const FloatImm* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const StringImm* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExprDefault_(const Node* op, Args ...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); return R(); } @@ -213,7 +213,7 @@ template class StmtFunctor { private: using TSelf = StmtFunctor; - using FType = IRFunctor; + using FType = IRFunctor; public: /*! \brief the result type of this functor */ @@ -255,7 +255,7 @@ class StmtFunctor { virtual R VisitStmt_(const Block* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const Evaluate* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmtDefault_(const Node* op, Args ...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); return R(); } diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h index b82a19d4689c..c910a48620c8 100644 --- a/include/tvm/ir_mutator.h +++ b/include/tvm/ir_mutator.h @@ -65,9 +65,9 @@ class TVM_DLL IRMutator { /*! \brief destructor */ virtual ~IRMutator() {} /*! \brief functor type of expr mutation */ - using FMutateExpr = IRFunctor; + using FMutateExpr = IRFunctor; /*! \brief functor type of stmt mutation */ - using FMutateStmt = IRFunctor; + using FMutateStmt = IRFunctor; /*! \return internal vtable of expr */ static FMutateExpr& vtable_expr(); // NOLINT(*) /*! \return internal stmt of expr */ diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h index f20b91368587..bebf94585ed6 100644 --- a/include/tvm/ir_visitor.h +++ b/include/tvm/ir_visitor.h @@ -49,7 +49,7 @@ namespace ir { * // The use case is to count number of Variables in the ir tree. 
* class MyCounter : public IRVisitor { * public: - * int Count(const NodeRef& n) { + * int Count(const ObjectRef& n) { * ret_ = 0; * this->Visit(n); * return ret_; @@ -94,7 +94,7 @@ class TVM_DLL IRVisitor { /*! \brief destructor */ virtual ~IRVisitor() {} /*! \brief functor type of visitor */ - using FVisit = IRFunctor; + using FVisit = IRFunctor; /*! \return internal vtable*/ static FVisit& vtable(); // overloadable visit function. diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h index 4da93b80c2ab..e2147d036587 100644 --- a/include/tvm/lowered_func.h +++ b/include/tvm/lowered_func.h @@ -44,7 +44,7 @@ class LoweredFuncNode; class LoweredFunc : public ir::FunctionRef { public: LoweredFunc() {} - explicit LoweredFunc(NodePtr n) : FunctionRef(n) {} + explicit LoweredFunc(ObjectPtr n) : FunctionRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container */ @@ -136,17 +136,14 @@ class LoweredFuncNode : public ir::FunctionBaseNode { // Implementations of inline functions inline const LoweredFuncNode* LoweredFunc::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm namespace std { template <> -struct hash<::tvm::LoweredFunc> { - std::size_t operator()(const ::tvm::LoweredFunc& k) const { - return k.hash(); - } +struct hash<::tvm::LoweredFunc> : public tvm::NodeHash { }; } diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h index c2c639e374f5..2e1a978f4806 100644 --- a/include/tvm/node/container.h +++ b/include/tvm/node/container.h @@ -38,14 +38,14 @@ namespace tvm { class ArrayNode : public Node { public: /*! \brief the data content */ - std::vector > data; + std::vector data; void VisitAttrs(AttrVisitor* visitor) final { // Visitor to array has no effect. } static constexpr const char* _type_key = "Array"; - TVM_DECLARE_NODE_TYPE_INFO(ArrayNode, Node); + TVM_DECLARE_FINAL_OBJECT_INFO(ArrayNode, Node); }; /*! \brief map node content */ @@ -54,32 +54,17 @@ class MapNode : public Node { void VisitAttrs(AttrVisitor* visitor) final { // Visitor to map has no effect. } - // hash function - struct Hash { - size_t operator()(const NodePtr& n) const { - return std::hash()(n.get()); - } - }; - // comparator - struct Equal { - bool operator()( - const NodePtr& a, - const NodePtr& b) const { - return a.get() == b.get(); - } - }; - /*! \brief The corresponding container type */ using ContainerType = std::unordered_map< - NodePtr, - NodePtr, - Hash, Equal>; + ObjectRef, + ObjectRef, + ObjectHash, ObjectEqual>; /*! \brief the data content */ ContainerType data; static constexpr const char* _type_key = "Map"; - TVM_DECLARE_NODE_TYPE_INFO(MapNode, Node); + TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Node); }; @@ -90,15 +75,13 @@ class StrMapNode : public Node { // Visitor to map has no effect. } /*! \brief The corresponding container type */ - using ContainerType = std::unordered_map< - std::string, - NodePtr >; + using ContainerType = std::unordered_map; /*! \brief the data content */ ContainerType data; static constexpr const char* _type_key = "StrMap"; - TVM_DECLARE_NODE_TYPE_INFO(StrMapNode, Node); + TVM_DECLARE_FINAL_OBJECT_INFO(StrMapNode, Node); }; /*! 
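MapNode can drop its hand-written Hash/Equal functors because ObjectHash and ObjectEqual compare by the address of the underlying Object, which is exactly what the old NodePtr-based functors did. A standalone sketch of those identity semantics, with a hypothetical DemoObj payload:

#include <tvm/runtime/memory.h>
#include <tvm/runtime/object.h>
#include <unordered_map>

using namespace tvm::runtime;

class DemoObj : public Object {
 public:
  static constexpr const char* _type_key = "demo.Obj";
  TVM_DECLARE_FINAL_OBJECT_INFO(DemoObj, Object);
};

int main() {
  // Same container shape as the new MapNode::ContainerType.
  std::unordered_map<ObjectRef, ObjectRef, ObjectHash, ObjectEqual> table;
  ObjectRef k1(make_object<DemoObj>());
  ObjectRef k2(make_object<DemoObj>());
  table[k1] = k2;
  // Keys hash and compare by pointer identity, not structural equality,
  // so a distinct allocation is a distinct key.
  return (table.count(k1) == 1 && table.count(k2) == 0) ? 0 : 1;
}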
@@ -111,9 +94,9 @@ template::difference_type; - using value_type = typename std::iterator_traits::value_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; + using value_type = typename Converter::ResultType; + using pointer = typename Converter::ResultType*; + using reference = typename Converter::ResultType&; // NOLINT(*) using iterator_category = typename std::iterator_traits::iterator_category; explicit IterAdapter(TIter iter) : iter_(iter) {} @@ -138,7 +121,7 @@ class IterAdapter { inline bool operator!=(IterAdapter other) const { return !(*this == other); } - inline const typename Converter::ResultType operator*() const { + inline const value_type operator*() const { return Converter::convert(*iter_); } @@ -162,26 +145,27 @@ class Array : public NodeRef { * \brief default constructor */ Array() { - node_ = make_node(); + data_ = make_node(); } /*! * \brief move constructor * \param other source */ Array(Array && other) { // NOLINT(*) - node_ = std::move(other.node_); + data_ = std::move(other.data_); } /*! * \brief copy constructor * \param other source */ - Array(const Array &other) : NodeRef(other.node_) { // NOLINT(*) + Array(const Array &other) { // NOLINT(*) + data_ = std::move(other.data_); } /*! * \brief constructor from pointer * \param n the container pointer */ - explicit Array(NodePtr n) : NodeRef(n) {} + explicit Array(ObjectPtr n) : NodeRef(n) {} /*! * \brief constructor from iterator * \param begin begin of iterator @@ -214,9 +198,9 @@ class Array : public NodeRef { explicit Array(size_t n, const T& val) { auto tmp_node = make_node(); for (size_t i = 0; i < n; ++i) { - tmp_node->data.push_back(val.node_); + tmp_node->data.push_back(val); } - node_ = std::move(tmp_node); + data_ = std::move(tmp_node); } /*! * \brief move assign operator @@ -224,7 +208,7 @@ class Array : public NodeRef { * \return reference to self. */ Array& operator=(Array && other) { - node_ = std::move(other.node_); + data_ = std::move(other.data_); return *this; } /*! @@ -233,7 +217,7 @@ class Array : public NodeRef { * \return reference to self. */ Array& operator=(const Array & other) { - node_ = other.node_; + data_ = other.data_; return *this; } /*! @@ -246,9 +230,9 @@ class Array : public NodeRef { void assign(IterType begin, IterType end) { auto n = make_node(); for (IterType it = begin; it != end; ++it) { - n->data.push_back((*it).node_); + n->data.push_back(T(*it)); } - node_ = std::move(n); + data_ = std::move(n); } /*! * \brief Read i-th element from array. @@ -256,12 +240,13 @@ class Array : public NodeRef { * \return the i-th element. */ inline const T operator[](size_t i) const { - return T(static_cast(node_.get())->data[i]); + return DowncastNoCheck( + static_cast(data_.get())->data[i]); } /*! \return The size of the array */ inline size_t size() const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.size(); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.size(); } /*! 
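The IterAdapter hunk above is the subtle piece of this change: value_type, pointer, and reference now derive from the Converter's ResultType rather than from the wrapped iterator's traits, which is what lets Array store plain ObjectRef yet yield T on dereference. A self-contained sketch of that mechanism (MiniIterAdapter and IntToString are illustrative names, not from the patch):

#include <iostream>
#include <string>
#include <vector>

// Adapter whose element type comes from the Converter, not the iterator.
template <typename Converter, typename TIter>
class MiniIterAdapter {
 public:
  using value_type = typename Converter::ResultType;
  explicit MiniIterAdapter(TIter iter) : iter_(iter) {}
  MiniIterAdapter& operator++() { ++iter_; return *this; }
  bool operator!=(MiniIterAdapter other) const { return iter_ != other.iter_; }
  value_type operator*() const { return Converter::convert(*iter_); }
 private:
  TIter iter_;
};

// Stand-in for Array's ValueConverter: storage type in, element type out.
struct IntToString {
  using ResultType = std::string;
  static std::string convert(int v) { return std::to_string(v); }
};

int main() {
  std::vector<int> xs{1, 2, 3};
  MiniIterAdapter<IntToString, std::vector<int>::const_iterator> it(xs.begin()),
      end(xs.end());
  for (; it != end; ++it) std::cout << *it << "\n";  // prints "1" "2" "3"
}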
* \brief copy on write semantics @@ -272,12 +257,12 @@ class Array : public NodeRef { * \return Handle to the internal node container(which ganrantees to be unique) */ inline ArrayNode* CopyOnWrite() { - if (node_.get() == nullptr || !node_.unique()) { + if (data_.get() == nullptr || !data_.unique()) { NodePtr n = make_node(); - n->data = static_cast(node_.get())->data; - NodePtr(std::move(n)).swap(node_); + n->data = static_cast(data_.get())->data; + ObjectPtr(std::move(n)).swap(data_); } - return static_cast(node_.get()); + return static_cast(data_.get()); } /*! * \brief push a new item to the back of the list @@ -285,7 +270,7 @@ class Array : public NodeRef { */ inline void push_back(const T& item) { ArrayNode* n = this->CopyOnWrite(); - n->data.push_back(item.node_); + n->data.push_back(item); } /*! * \brief set i-th element of the array. @@ -294,7 +279,7 @@ class Array : public NodeRef { */ inline void Set(size_t i, const T& value) { ArrayNode* n = this->CopyOnWrite(); - n->data[i] = value.node_; + n->data[i] = value; } /*! \return whether array is empty */ inline bool empty() const { @@ -303,34 +288,34 @@ class Array : public NodeRef { /*! \brief specify container node */ using ContainerType = ArrayNode; - struct Ptr2NodeRef { + struct ValueConverter { using ResultType = T; - static inline T convert(const NodePtr& n) { - return T(n); + static inline T convert(const ObjectRef& n) { + return DowncastNoCheck(n); } }; - using iterator = IterAdapter >::const_iterator>; + using iterator = IterAdapter::const_iterator>; using reverse_iterator = IterAdapter< - Ptr2NodeRef, - std::vector >::const_reverse_iterator>; + ValueConverter, + std::vector::const_reverse_iterator>; /*! \return begin iterator */ inline iterator begin() const { - return iterator(static_cast(node_.get())->data.begin()); + return iterator(static_cast(data_.get())->data.begin()); } /*! \return end iterator */ inline iterator end() const { - return iterator(static_cast(node_.get())->data.end()); + return iterator(static_cast(data_.get())->data.end()); } /*! \return rbegin iterator */ inline reverse_iterator rbegin() const { - return reverse_iterator(static_cast(node_.get())->data.rbegin()); + return reverse_iterator(static_cast(data_.get())->data.rbegin()); } /*! \return rend iterator */ inline reverse_iterator rend() const { - return reverse_iterator(static_cast(node_.get())->data.rend()); + return reverse_iterator(static_cast(data_.get())->data.rend()); } }; @@ -355,26 +340,26 @@ class Map : public NodeRef { * \brief default constructor */ Map() { - node_ = make_node(); + data_ = make_node(); } /*! * \brief move constructor * \param other source */ Map(Map && other) { // NOLINT(*) - node_ = std::move(other.node_); + data_ = std::move(other.data_); } /*! * \brief copy constructor * \param other source */ - Map(const Map &other) : NodeRef(other.node_) { // NOLINT(*) + Map(const Map &other) : NodeRef(other.data_) { // NOLINT(*) } /*! * \brief constructor from pointer * \param n the container pointer */ - explicit Map(NodePtr n) : NodeRef(n) {} + explicit Map(ObjectPtr n) : NodeRef(n) {} /*! * \brief constructor from iterator * \param begin begin of iterator @@ -406,7 +391,7 @@ class Map : public NodeRef { * \return reference to self. */ Map& operator=(Map && other) { - node_ = std::move(other.node_); + data_ = std::move(other.data_); return *this; } /*! @@ -415,7 +400,7 @@ class Map : public NodeRef { * \return reference to self. 
*/ Map& operator=(const Map & other) { - node_ = other.node_; + data_ = other.data_; return *this; } /*! @@ -428,10 +413,9 @@ class Map : public NodeRef { void assign(IterType begin, IterType end) { NodePtr n = make_node(); for (IterType i = begin; i != end; ++i) { - n->data.emplace(std::make_pair(i->first.node_, - i->second.node_)); + n->data.emplace(std::make_pair(i->first, i->second)); } - node_ = std::move(n); + data_ = std::move(n); } /*! * \brief Read element from map. @@ -439,7 +423,8 @@ class Map : public NodeRef { * \return the corresonding element. */ inline const V operator[](const K& key) const { - return V(static_cast(node_.get())->data.at(key.node_)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } /*! * \brief Read element from map. @@ -447,17 +432,18 @@ class Map : public NodeRef { * \return the corresonding element. */ inline const V at(const K& key) const { - return V(static_cast(node_.get())->data.at(key.node_)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } /*! \return The size of the array */ inline size_t size() const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.size(); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.size(); } /*! \return The number of elements of the key */ inline size_t count(const K& key) const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.count(key.node_); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.count(key); } /*! * \brief copy on write semantics @@ -468,12 +454,12 @@ class Map : public NodeRef { * \return Handle to the internal node container(which ganrantees to be unique) */ inline MapNode* CopyOnWrite() { - if (node_.get() == nullptr || !node_.unique()) { + if (data_.get() == nullptr || !data_.unique()) { NodePtr n = make_node(); - n->data = static_cast(node_.get())->data; - NodePtr(std::move(n)).swap(node_); + n->data = static_cast(data_.get())->data; + ObjectPtr(std::move(n)).swap(data_); } - return static_cast(node_.get()); + return static_cast(data_.get()); } /*! * \brief set the Map. @@ -482,7 +468,7 @@ class Map : public NodeRef { */ inline void Set(const K& key, const V& value) { MapNode* n = this->CopyOnWrite(); - n->data[key.node_] = value.node_; + n->data[key] = value; } /*! \return whether array is empty */ @@ -492,29 +478,31 @@ class Map : public NodeRef { /*! \brief specify container node */ using ContainerType = MapNode; - struct Ptr2NodeRef { + struct ValueConverter { using ResultType = std::pair; static inline ResultType convert(const std::pair< - NodePtr, - NodePtr >& n) { - return std::make_pair(K(n.first), V(n.second)); + ObjectRef, + ObjectRef>& n) { + return std::make_pair(DowncastNoCheck(n.first), + DowncastNoCheck(n.second)); } }; using iterator = IterAdapter< - Ptr2NodeRef, MapNode::ContainerType::const_iterator>; + ValueConverter, MapNode::ContainerType::const_iterator>; /*! \return begin iterator */ inline iterator begin() const { - return iterator(static_cast(node_.get())->data.begin()); + return iterator(static_cast(data_.get())->data.begin()); } /*! \return end iterator */ inline iterator end() const { - return iterator(static_cast(node_.get())->data.end()); + return iterator(static_cast(data_.get())->data.end()); } /*! 
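Array and Map both keep the copy-on-write contract spelled out above: every mutator first calls CopyOnWrite(), which clones the node unless this handle is the unique owner. A brief usage sketch against the post-patch Array API:

#include <tvm/expr.h>
#include <tvm/node/container.h>

using namespace tvm;

void cow_demo() {
  Array<Integer> a;
  a.push_back(Integer(1));
  Array<Integer> b = a;   // b shares a's ArrayNode (refcount is now 2)
  a.Set(0, Integer(42));  // CopyOnWrite() clones the node before mutating
  // b still observes 1 at index 0; a observes 42.
}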
\return begin iterator */ inline iterator find(const K& key) const { - return iterator(static_cast(node_.get())->data.find(key.node_)); + return iterator( + static_cast(data_.get())->data.find(key)); } }; @@ -524,14 +512,14 @@ class Map : public NodeRef { public: // for code reuse Map() { - node_ = make_node(); + data_ = make_node(); } Map(Map && other) { // NOLINT(*) - node_ = std::move(other.node_); + data_ = std::move(other.data_); } - Map(const Map &other) : NodeRef(other.node_) { // NOLINT(*) + Map(const Map &other) : NodeRef(other.data_) { // NOLINT(*) } - explicit Map(NodePtr n) : NodeRef(n) {} + explicit Map(ObjectPtr n) : NodeRef(n) {} template Map(IterType begin, IterType end) { assign(begin, end); @@ -545,76 +533,77 @@ class Map : public NodeRef { assign(init.begin(), init.end()); } Map& operator=(Map && other) { - node_ = std::move(other.node_); + data_ = std::move(other.data_); return *this; } Map& operator=(const Map & other) { - node_ = other.node_; + data_ = other.data_; return *this; } template void assign(IterType begin, IterType end) { auto n = make_node(); for (IterType i = begin; i != end; ++i) { - n->data.emplace(std::make_pair(i->first, - i->second.node_)); + n->data.emplace(std::make_pair(i->first, i->second)); } - node_ = std::move(n); + data_ = std::move(n); } inline const V operator[](const std::string& key) const { - return V(static_cast(node_.get())->data.at(key)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } inline const V at(const std::string& key) const { - return V(static_cast(node_.get())->data.at(key)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } inline size_t size() const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.size(); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.size(); } inline size_t count(const std::string& key) const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.count(key); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.count(key); } inline StrMapNode* CopyOnWrite() { - if (node_.get() == nullptr || !node_.unique()) { + if (data_.get() == nullptr || !data_.unique()) { NodePtr n = make_node(); - n->data = static_cast(node_.get())->data; - NodePtr(std::move(n)).swap(node_); + n->data = static_cast(data_.get())->data; + ObjectPtr(std::move(n)).swap(data_); } - return static_cast(node_.get()); + return static_cast(data_.get()); } inline void Set(const std::string& key, const V& value) { StrMapNode* n = this->CopyOnWrite(); - n->data[key] = value.node_; + n->data[key] = value; } inline bool empty() const { return size() == 0; } using ContainerType = StrMapNode; - struct Ptr2NodeRef { + struct ValueConverter { using ResultType = std::pair; static inline ResultType convert(const std::pair< - std::string, - NodePtr >& n) { - return std::make_pair(n.first, V(n.second)); + std::string, + ObjectRef>& n) { + return std::make_pair(n.first, DowncastNoCheck(n.second)); } }; using iterator = IterAdapter< - Ptr2NodeRef, StrMapNode::ContainerType::const_iterator>; + ValueConverter, StrMapNode::ContainerType::const_iterator>; /*! \return begin iterator */ inline iterator begin() const { - return iterator(static_cast(node_.get())->data.begin()); + return iterator(static_cast(data_.get())->data.begin()); } /*! 
\return end iterator */ inline iterator end() const { - return iterator(static_cast(node_.get())->data.end()); + return iterator(static_cast(data_.get())->data.end()); } /*! \return begin iterator */ inline iterator find(const std::string& key) const { - return iterator(static_cast(node_.get())->data.find(key)); + return iterator(static_cast(data_.get())->data.find(key)); } }; diff --git a/include/tvm/node/ir_functor.h b/include/tvm/node/ir_functor.h index 23c5a3fafdab..e902e8fb6d44 100644 --- a/include/tvm/node/ir_functor.h +++ b/include/tvm/node/ir_functor.h @@ -34,10 +34,10 @@ namespace tvm { /*! - * \brief A dynamically dispatched functor on NodeRef in the first argument. + * \brief A dynamically dispatched functor on ObjectRef in the first argument. * * \code - * IRFunctor tostr; + * IRFunctor tostr; * tostr.set_dispatch([](const Add* op, std::string prefix) { * return prefix + "Add"; * }); @@ -60,10 +60,10 @@ template class IRFunctor; template -class IRFunctor { +class IRFunctor { private: - using Function = std::function; - using TSelf = IRFunctor; + using Function = std::function; + using TSelf = IRFunctor; /*! \brief internal function table */ std::vector func_; @@ -75,8 +75,8 @@ class IRFunctor { * \param n The node to be dispatched * \return Whether dispatching function is registered for n's type. */ - inline bool can_dispatch(const NodeRef& n) const { - uint32_t type_index = n.type_index(); + inline bool can_dispatch(const ObjectRef& n) const { + uint32_t type_index = n->type_index(); return type_index < func_.size() && func_[type_index] != nullptr; } /*! @@ -85,12 +85,12 @@ class IRFunctor { * \param args The additional arguments * \return The result. */ - inline R operator()(const NodeRef& n, Args... args) const { - uint32_t type_index = n.type_index(); + inline R operator()(const ObjectRef& n, Args... args) const { + uint32_t type_index = n->type_index(); CHECK(type_index < func_.size() && func_[type_index] != nullptr) << "IRFunctor calls un-registered function on type " - << Node::TypeIndex2Key(type_index); + << n->GetTypeKey(); return func_[type_index](n, std::forward(args)...); } /*! @@ -101,19 +101,19 @@ class IRFunctor { */ template inline TSelf& set_dispatch(Function f) { // NOLINT(*) - uint32_t tindex = Node::TypeKey2Index(TNode::_type_key); + uint32_t tindex = TNode::RuntimeTypeIndex(); if (func_.size() <= tindex) { func_.resize(tindex + 1, nullptr); } CHECK(func_[tindex] == nullptr) - << "Dispatch for " << Node::TypeIndex2Key(tindex) + << "Dispatch for " << TNode::_type_key << " is already set"; func_[tindex] = f; return *this; } /*! * \brief set the dispatcher for type TNode - * This allows f to used detailed const Node pointer to replace NodeRef + * This allows f to use a detailed const Node pointer to replace ObjectRef * * \param f The function to be set. * \tparam TNode the type of Node to be dispatched. * \return reference to self. */ template inline TSelf& set_dispatch(std::function f) { // NOLINT(*) - Function fun = [f](const ObjectRef& n, Args... 
args) { + return f(static_cast(n.get()), std::forward(args)...); }; return this->set_dispatch(fun); @@ -135,7 +135,7 @@ class IRFunctor { */ template inline TSelf& clear_dispatch() { // NOLINT(*) - uint32_t tindex = Node::TypeKey2Index(TNode::_type_key); + uint32_t tindex = TNode::RuntimeTypeIndex(); CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range"; func_[tindex] = nullptr; return *this; @@ -172,7 +172,7 @@ class IRFunctor { * f(e, this); * } * - * using FType = IRFunctor; + * using FType = IRFunctor; * // function to return global function table * static FType& vtable(); * }; @@ -232,15 +232,15 @@ template class IRFunctorStaticRegistry; template -class IRFunctorStaticRegistry { +class IRFunctorStaticRegistry { private: - IRFunctor *irf_; + IRFunctor *irf_; std::shared_ptr free_list; - using TSelf = IRFunctorStaticRegistry; + using TSelf = IRFunctorStaticRegistry; public: - IRFunctorStaticRegistry(IRFunctor *irf) { + IRFunctorStaticRegistry(IRFunctor *irf) { irf_ = irf; free_list = std::make_shared(); } @@ -261,12 +261,12 @@ class IRFunctorStaticRegistry { * the compiler to deduce the template types. */ template -IRFunctorStaticRegistry MakeIRFunctorStaticRegistry( - IRFunctor *irf) { - return IRFunctorStaticRegistry(irf); +IRFunctorStaticRegistry MakeIRFunctorStaticRegistry( + IRFunctor *irf) { + return IRFunctorStaticRegistry(irf); } -#define TVM_AUTO_REGISTER_VAR_DEF(ClsName) \ +#define TVM_AUTO_REGISTER_VAR_DEF(ClsName) \ static TVM_ATTRIBUTE_UNUSED auto __make_functor ## _ ## ClsName /*! diff --git a/include/tvm/node/memory.h b/include/tvm/node/memory.h deleted file mode 100644 index 1bba57144e19..000000000000 --- a/include/tvm/node/memory.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/*! - * \file tvm/node/memory.h - * \brief Node memory management. - */ -#ifndef TVM_NODE_MEMORY_H_ -#define TVM_NODE_MEMORY_H_ - -#include -#include "node.h" - -namespace tvm { -/*! - * \brief Allocate a node object. - * \param args arguments to the constructor. - * \tparam T the node type. - * \return The NodePtr to the allocated object. - */ -template -inline NodePtr make_node(Args&&... args); - -// Detail implementations after this -// -// The current design allows swapping the -// allocator pattern when necessary. -// -// Possible future allocator optimizations: -// - Arena allocator that gives ownership of memory to arena (deleter_= nullptr) -// - Thread-local object pools: one pool per size and alignment requirement. -// - Can specialize by type of object to give the specific allocator to each object. -// -template -class SimpleNodeAllocator { - public: - template - static T* New(Args&&... 
args) { - return new T(std::forward(args)...); - } - static NodeBase::FDeleter Deleter() { - return Deleter_; - } - - private: - static void Deleter_(NodeBase* ptr) { - delete static_cast(ptr); - } -}; - -template -inline NodePtr make_node(Args&&... args) { - using Allocator = SimpleNodeAllocator; - static_assert(std::is_base_of::value, - "make_node can only be used to create NodeBase"); - T* node = Allocator::New(std::forward(args)...); - node->deleter_ = Allocator::Deleter(); - return NodePtr(node); -} - -} // namespace tvm -#endif // TVM_NODE_MEMORY_H_ diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h index cb18e46e9a5c..8203ee69f686 100644 --- a/include/tvm/node/node.h +++ b/include/tvm/node/node.h @@ -25,7 +25,9 @@ #include #include -#include +#include +#include +#include #include #include #include @@ -38,13 +40,6 @@ class DataType; class Node; class NodeRef; -namespace runtime { -// forward declaration -class NDArray; -// forward declaration -class ObjectRef; -} // namespace runtime - /*! * \brief Visitor class to each node content. * The content is going to be called for each field. @@ -74,15 +69,17 @@ class TVM_DLL AttrVisitor { //! \endcond }; +/*! \brief Reuse the type index in the runtime. */ +using TypeIndex = runtime::TypeIndex; + /*! * \brief base class of node container in DSL AST. */ -class TVM_DLL Node : public NodeBase { +class Node : public runtime::Object { public: /*! \brief virtual destructor */ virtual ~Node() {} - /*! \return The unique type key of the node */ - virtual const char* type_key() const = 0; + /*! * \brief Apply visitor to each field of the Node * Visitor could mutate the content of the node. * \param visitor The visitor */ virtual void VisitAttrs(AttrVisitor* visitor) {} - /*! \return the type index of the node */ - virtual uint32_t type_index() const = 0; - /*! - * \brief Whether this node derives from node with type_index=tid. - * Implemented by TVM_DECLARE_NODE_TYPE_INFO - * - * \param tid The type index. - * \return the check result. - */ - virtual bool _DerivedFrom(uint32_t tid) const; - /*! - * \brief get a runtime unique type index given a type key - * \param type_key Type key of a type. - * \return the corresponding type index. - */ - static uint32_t TypeKey2Index(const char* type_key); - /*! - * \brief get type key from type index. - * \param index The type index - * \return the corresponding type key. - */ - static const char* TypeIndex2Key(uint32_t index); - /*! - * \return whether the type is derived from - */ - template - inline bool derived_from() const; - /*! - * \return whether the node is of type T - * \tparam The type to be checked. - */ - template - inline bool is_type() const; - /*! - * \brief Get a NodePtr that holds reference to this Node. - * \return the NodePtr - */ - inline NodePtr GetNodePtr() const; - // node ref can see this - friend class NodeRef; + static constexpr const char* _type_key = "Node"; + static constexpr uint32_t _type_index = TypeIndex::kDynamic; + + TVM_DECLARE_BASE_OBJECT_INFO(Node, runtime::Object); }; -/*! \brief Base class of all node reference object */ -class NodeRef { + +/*! + * \brief Base class of all node reference objects + * NodeRef is just an alias of ObjectRef. + */ +class NodeRef : public runtime::ObjectRef { public: /*! \brief type indicate the container type */ using ContainerType = Node; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. 
- */ - inline bool operator==(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool same_as(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool operator<(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool operator!=(const NodeRef& other) const; - /*! \return the hash function for NodeRef */ - inline size_t hash() const; - /*! \return whether the expression is null */ - inline bool defined() const; - /*! \return the internal type index of IRNode */ - inline uint32_t type_index() const; + /*! \return the internal node pointer */ - inline const Node* get() const; + const Node* get() const { + return static_cast(ObjectRef::get()); + } /*! \return the internal node pointer */ - inline const Node* operator->() const; - /*! - * \brief Downcast this ir node to its actual type (e.g. Add, or - * Select). This returns nullptr if the node is not of the requested - * type. Example usage: - * - * if (const Add *add = node->as()) { - * // This is an add node - * } - * \tparam T the target type, must be subtype of IRNode - */ - template - inline const T *as() const; + const Node* operator->() const { + return get(); + } /*! * \brief A more powerful version of as that also works with * intermediate base types. * \tparam T the target type, must be subtype of IRNode */ template - inline const T *as_derived() const; + const T *as_derived() const { + return as(); + } /*! \brief default constructor */ NodeRef() = default; - explicit NodeRef(NodePtr node) : node_(node) {} - /*! \brief the internal node object, do not touch */ - NodePtr node_; + explicit NodeRef(runtime::ObjectPtr ptr) : ObjectRef(ptr) {} }; -/*! - * \brief Get a reference type from a Node ptr type - * - * It is always important to get a reference type - * if we want to return a value as reference or keep - * the node alive beyond the scope of the function. - * - * \param ptr The node pointer - * \tparam RefType The reference type - * \tparam NodeType The node type - * \return The corresponding RefType - */ -template -inline RefType GetRef(const NodeType* ptr); - -/*! - * \brief Downcast a base reference type to a more specific type. - * - * \param ref The inptut reference - * \return The corresponding SubRef. - * \tparam SubRef The target specific reference type. - * \tparam BaseRef the current reference type. - */ -template -inline SubRef Downcast(BaseRef ref); - /*! * \brief helper macro to declare type information in a base node. */ -#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent) \ - bool _DerivedFrom(uint32_t tid) const override { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - if (tidx == tid) return true; \ - return Parent::_DerivedFrom(tid); \ - } +#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent) \ + TVM_DECLARE_BASE_OBJECT_INFO(TypeName, Parent) /*! 
* \brief helper macro to declare type information in a terminal node */ -#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent) \ - const char* type_key() const final { \ - return TypeName::_type_key; \ - } \ - uint32_t type_index() const final { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - return tidx; \ - } \ - bool _DerivedFrom(uint32_t tid) const final { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - if (tidx == tid) return true; \ - return Parent::_DerivedFrom(tid); \ - } - -// implementations of inline functions after this -template -inline bool Node::derived_from() const { - // use static field so query only happens once. - static uint32_t type_id = Node::TypeKey2Index(T::_type_key); - return this->_DerivedFrom(type_id); -} - - -template -inline bool Node::is_type() const { - // use static field so query only happens once. - static uint32_t type_id = Node::TypeKey2Index(T::_type_key); - return type_id == this->type_index(); -} +#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent) \ + TVM_DECLARE_FINAL_OBJECT_INFO(TypeName, Parent); -inline NodePtr Node::GetNodePtr() const { - return NodePtr(const_cast(this)); -} +using runtime::Object; +using runtime::ObjectPtr; +using runtime::ObjectRef; +using runtime::GetRef; +using runtime::Downcast; +using runtime::make_object; +using runtime::ObjectHash; +using runtime::ObjectEqual; -template -inline RefType GetRef(const NodeType* ptr) { - static_assert(std::is_base_of::value, - "Can only cast to the ref of same container type"); - return RefType(ptr->GetNodePtr()); -} - -template -inline SubRef Downcast(BaseRef ref) { - CHECK(ref->template is_type() || - ref->template derived_from()) - << "Downcast from " << ref->type_key() << " to " - << SubRef::ContainerType::_type_key << " failed."; - return SubRef(std::move(ref.node_)); -} - -inline const Node* NodeRef::get() const { - return node_.get(); -} - -inline const Node* NodeRef::operator->() const { - return node_.get(); -} - -inline bool NodeRef::defined() const { - return node_.get() != nullptr; -} - -inline bool NodeRef::operator==(const NodeRef& other) const { - return node_.get() == other.node_.get(); -} +using NodeHash = ObjectHash; +using NodeEqual = ObjectEqual; -inline bool NodeRef::same_as(const NodeRef& other) const { - return node_.get() == other.node_.get(); -} - -inline bool NodeRef::operator<(const NodeRef& other) const { - return node_.get() < other.node_.get(); -} - -inline bool NodeRef::operator!=(const NodeRef& other) const { - return node_.get() != other.node_.get(); -} - -inline size_t NodeRef::hash() const { - return std::hash()(node_.get()); -} - -inline uint32_t NodeRef::type_index() const { - CHECK(node_.get() != nullptr) - << "null type"; - return get()->type_index(); -} - -template -inline const T* NodeRef::as() const { - const Node* ptr = static_cast(get()); - if (ptr && ptr->is_type()) { - return static_cast(ptr); - } - return nullptr; -} - -template -inline const T* NodeRef::as_derived() const { - const Node* ptr = static_cast(get()); - if (ptr && (ptr->is_type() || ptr->derived_from())) { - return static_cast(ptr); - } - return nullptr; +/*! + * \brief Allocate a node object. + * \param args arguments to the constructor. + * \tparam T the node type. + * \return The NodePtr to the allocated object. + */ +template +inline NodePtr make_node(Args&&... args) { + return runtime::make_object(std::forward(args)...); } - -/*! 
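Thanks to the aliases and forwarding macros above, legacy node definitions keep compiling unchanged: TVM_DECLARE_NODE_TYPE_INFO expands to TVM_DECLARE_FINAL_OBJECT_INFO, and make_node is now a thin wrapper over runtime::make_object. A hypothetical node written against the shim:

#include <tvm/node/node.h>

namespace tvm {

class DemoNode : public Node {
 public:
  int payload{0};
  void VisitAttrs(AttrVisitor* v) final {
    v->Visit("payload", &payload);
  }
  static constexpr const char* _type_key = "demo.DemoNode";
  TVM_DECLARE_NODE_TYPE_INFO(DemoNode, Node);  // now the Object macro underneath
};

inline NodePtr<DemoNode> MakeDemo(int x) {
  NodePtr<DemoNode> n = make_node<DemoNode>();  // allocates via make_object
  n->payload = x;
  return n;
}

}  // namespace tvm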
\brief The hash function for nodes */ -struct NodeHash { - size_t operator()(const NodeRef& a) const { - return a.hash(); - } -}; - -/*! \brief The equal comparator for nodes */ -struct NodeEqual { - bool operator()(const NodeRef& a, const NodeRef& b) const { - return a.get() == b.get(); - } -}; } // namespace tvm #endif // TVM_NODE_NODE_H_ diff --git a/include/tvm/operation.h b/include/tvm/operation.h index b950aa952f04..b942464d4907 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -651,7 +651,7 @@ inline Tensor compute(Array shape, // inline function. inline const OperationNode* Operation::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm #endif // TVM_OPERATION_H_ diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h index 5951594b873c..48d46fdf2fc6 100644 --- a/include/tvm/packed_func_ext.h +++ b/include/tvm/packed_func_ext.h @@ -37,6 +37,7 @@ #include "runtime/packed_func.h" namespace tvm { + using runtime::TVMArgs; using runtime::TVMRetValue; using runtime::PackedFunc; @@ -47,86 +48,82 @@ namespace runtime { * \tparam T the type to be checked. */ template -struct NodeTypeChecker { - static inline bool Check(Node* sptr) { - // This is the only place in the project where RTTI is used - // It can be turned off, but will make non strict checking. - // TODO(tqchen) possibly find alternative to turn of RTTI +struct ObjectTypeChecker { + static bool Check(const Object* ptr) { using ContainerType = typename T::ContainerType; - // always allow nullptr. - if (sptr == nullptr) return true; - return sptr->derived_from(); + if (ptr == nullptr) return true; + return ptr->IsInstance(); } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) + static void PrintName(std::ostream& os) { // NOLINT(*) using ContainerType = typename T::ContainerType; os << ContainerType::_type_key; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if (!sptr->is_type()) return false; - ArrayNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const ArrayNode* n = static_cast(ptr); for (const auto& p : n->data) { - if (!NodeTypeChecker::Check(p.get())) { + if (!ObjectTypeChecker::Check(p.get())) { return false; } } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "array<"; - NodeTypeChecker::PrintName(os); - os << ">"; + static void PrintName(std::ostream& os) { // NOLINT(*) + os << "List["; + ObjectTypeChecker::PrintName(os); + os << "]"; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if (!sptr->is_type()) return false; - StrMapNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const StrMapNode* n = static_cast(ptr); for (const auto& kv : n->data) { - if (!NodeTypeChecker::Check(kv.second.get())) return false; + if (!ObjectTypeChecker::Check(kv.second.get())) return false; } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "map::PrintName(os); - os << '>'; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if 
(!sptr->is_type()) return false; - MapNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const MapNode* n = static_cast(ptr); for (const auto& kv : n->data) { - if (!NodeTypeChecker::Check(kv.first.get())) return false; - if (!NodeTypeChecker::Check(kv.second.get())) return false; + if (!ObjectTypeChecker::Check(kv.first.get())) return false; + if (!ObjectTypeChecker::Check(kv.second.get())) return false; } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "map<"; - NodeTypeChecker::PrintName(os); + static void PrintName(std::ostringstream& os) { // NOLINT(*) + os << "Map["; + ObjectTypeChecker::PrintName(os); os << ','; - NodeTypeChecker::PrintName(os); - os << '>'; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; template -inline std::string NodeTypeName() { +inline std::string ObjectTypeName() { std::ostringstream os; - NodeTypeChecker::PrintName(os); + ObjectTypeChecker::PrintName(os); return os.str(); } @@ -138,12 +135,12 @@ inline TNodeRef TVMArgValue::AsNodeRef() const { std::is_base_of::value, "Conversion only works for NodeRef"); if (type_code_ == kNull) return TNodeRef(NodePtr(nullptr)); - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return TNodeRef(sptr); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TNodeRef(ObjectPtr(ptr)); } inline TVMArgValue::operator tvm::Expr() const { @@ -156,18 +153,20 @@ inline TVMArgValue::operator tvm::Expr() const { if (type_code_ == kDLFloat) { return Expr(static_cast(value_.v_float64)); } - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - if (sptr->is_type()) { - return IterVar(sptr)->var; + + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + + if (ptr->IsInstance()) { + return IterVar(ObjectPtr(ptr))->var; } - if (sptr->is_type()) { - return Tensor(sptr)(); + if (ptr->IsInstance()) { + return Tensor(ObjectPtr(ptr))(); } - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return Expr(sptr); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return Expr(ObjectPtr(ptr)); } inline TVMArgValue::operator tvm::Integer() const { @@ -177,68 +176,36 @@ inline TVMArgValue::operator tvm::Integer() const { CHECK_GE(value_.v_int64, std::numeric_limits::min()); return Integer(static_cast(value_.v_int64)); } - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return Integer(sptr); -} - -inline NodePtr& TVMArgValue::node_sptr() { - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - return *ptr >(); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return Integer(ObjectPtr(ptr)); } - template -inline bool TVMArgValue::IsNodeType() const { - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& 
sptr = - *ptr >(); - return NodeTypeChecker::Check(sptr.get()); +inline bool TVMPODValue_::IsObjectRef() const { + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + return ObjectTypeChecker::Check(ptr); } // extensions for TVMRetValue -inline TVMRetValue& TVMRetValue::operator=( - const NodePtr& other) { - if (other.get() == nullptr) { - SwitchToPOD(kNull); - } else { - SwitchToClass >(kNodeHandle, other); - } - return *this; -} - -inline TVMRetValue& TVMRetValue::operator=(const NodeRef& other) { - if (!other.defined()) { - SwitchToPOD(kNull); - } else { - SwitchToClass >(kNodeHandle, other.node_); - } - return *this; -} - template inline TNodeRef TVMRetValue::AsNodeRef() const { static_assert( std::is_base_of::value, "Conversion only works for NodeRef"); if (type_code_ == kNull) return TNodeRef(); - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return TNodeRef(sptr); -} + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); -inline void TVMArgsSetter::operator()(size_t i, const NodeRef& other) const { // NOLINT(*) - if (other.defined()) { - values_[i].v_handle = const_cast*>(&(other.node_)); - type_codes_[i] = kNodeHandle; - } else { - type_codes_[i] = kNull; - } + Object* ptr = static_cast(value_.v_handle); + + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TNodeRef(ObjectPtr(ptr)); } // type related stuffs diff --git a/include/tvm/relay/adt.h b/include/tvm/relay/adt.h index 4329c438e8a0..e54d88d5a393 100644 --- a/include/tvm/relay/adt.h +++ b/include/tvm/relay/adt.h @@ -52,7 +52,7 @@ class PatternNode : public RelayNode { class Pattern : public NodeRef { public: Pattern() {} - explicit Pattern(NodePtr p) : NodeRef(p) {} + explicit Pattern(ObjectPtr p) : NodeRef(p) {} using ContainerType = PatternNode; }; diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index f94ba5e26068..15330b00e961 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -83,10 +83,12 @@ using NodeEqual = ::tvm::NodeEqual; #define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase) \ class TypeName : public NodeRefBase { \ public: \ - TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRefBase(n) {} \ + TypeName() {} \ + explicit TypeName(::tvm::ObjectPtr<::tvm::Object> n) \ + : NodeRefBase(n) { \ + } \ const NodeName* operator->() const { \ - return static_cast(node_.get()); \ + return static_cast(get()); \ } \ operator bool() { return this->defined(); } \ using ContainerType = NodeName; \ @@ -127,7 +129,7 @@ class SourceName : public NodeRef { * \return the pointer to the internal node container */ inline const SourceNameNode* operator->() const { - return static_cast(this->node_.get()); + return static_cast(get()); } /*! 
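The net effect of the packed_func_ext.h rewrite is that every node crossing the PackedFunc boundary now travels as a bare Object* tagged kObjectHandle, and ObjectTypeChecker validates it before a reference is re-wrapped around it. A rough sketch of that round trip (the helper function is hypothetical; the conversion calls are the ones shown above):

#include <tvm/expr.h>
#include <tvm/packed_func_ext.h>

using namespace tvm;

// Pass an Expr through a PackedFunc and recover it with a checked conversion.
Expr RoundTrip(runtime::PackedFunc identity) {
  runtime::TVMRetValue rv = identity(Expr(1));  // travels as kObjectHandle
  // AsNodeRef runs ObjectTypeChecker<Expr>::Check on the stored Object*
  // and fails loudly if the payload is not an Expr container type.
  return rv.AsNodeRef<Expr>();
}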
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index b1b8d6a7154e..281b99297e78 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -541,10 +541,11 @@ RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr); // implementataions inline const Type& ExprNode::checked_type() const { - CHECK(checked_type_.defined()) << "internal error: the type checker has " - "not populated the checked_type " - "field for " - << GetRef(this); + CHECK(checked_type_.defined()) + << "internal error: the type checker has " + << "not populated the checked_type " + << "field for " + << GetRef(this); return this->checked_type_; } @@ -557,7 +558,7 @@ inline const TTypeNode* ExprNode::type_as() const { const TTypeNode* node = checked_type_.as(); CHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key - << ", but get " << checked_type_->type_key(); + << ", but get " << checked_type_->GetTypeKey(); return node; } diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index e0d940c5d1a5..8bc87a27f66f 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -57,8 +57,8 @@ class ExprFunctor; #define RELAY_EXPR_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitExpr_(static_cast(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitExpr_(static_cast(n.get()), \ std::forward(args)...); \ }); @@ -66,7 +66,7 @@ template class ExprFunctor { private: using TSelf = ExprFunctor; - using FType = tvm::IRFunctor; + using FType = tvm::IRFunctor; public: /*! \brief the result type of this functor */ @@ -117,7 +117,7 @@ class ExprFunctor { virtual R VisitExpr_(const ConstructorNode* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const MatchNode* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExprDefault_(const Node* op, Args...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; } diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h index d05099f781ac..a0422fa7f446 100644 --- a/include/tvm/relay/interpreter.h +++ b/include/tvm/relay/interpreter.h @@ -78,9 +78,9 @@ class ValueNode : public RelayNode { class Value : public NodeRef { public: Value() {} - explicit Value(NodePtr n) : NodeRef(n) {} + explicit Value(ObjectPtr n) : NodeRef(n) {} const ValueNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } using ContainerType = ValueNode; diff --git a/include/tvm/relay/module.h b/include/tvm/relay/module.h index 8b17020a1132..10d72349d0f5 100644 --- a/include/tvm/relay/module.h +++ b/include/tvm/relay/module.h @@ -281,10 +281,10 @@ class ModuleNode : public RelayNode { struct Module : public NodeRef { Module() {} - explicit Module(NodePtr p) : NodeRef(p) {} + explicit Module(ObjectPtr<::tvm::Object> p) : NodeRef(p) {} - inline ModuleNode* operator->() const { - return static_cast(node_.get()); + ModuleNode* operator->() const { + return static_cast(get_mutable()); } using ContainerType = ModuleNode; diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h index 0a6d3725655f..572c194bc269 100644 --- a/include/tvm/relay/op.h +++ b/include/tvm/relay/op.h @@ -138,7 +138,7 @@ class Op : public relay::Expr { /*! \brief default constructor */ Op() {} /*! 
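Only the dispatch plumbing in RELAY_EXPR_FUNCTOR_DISPATCH changed above; concrete visitors are untouched by the NodeRef to ObjectRef switch because they already receive raw node pointers. An illustrative functor (not part of the patch):

#include <tvm/relay/expr_functor.h>

namespace tvm {
namespace relay {

// Returns 1 for a call node, 0 for anything else. Dispatch goes through the
// rewritten vtable, but the overrides look exactly as they did before.
class IsCallCheck : public ExprFunctor<int(const Expr&)> {
 public:
  int VisitExpr_(const CallNode* op) final { return 1; }
  int VisitExprDefault_(const Node* op) final { return 0; }
};

}  // namespace relay
}  // namespace tvm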
\brief constructor from node pointer */ - explicit Op(NodePtr n) : Expr(n) {} + explicit Op(ObjectPtr n) : Expr(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container */ @@ -221,11 +221,12 @@ class OpRegistry { const Attrs&, const TypeReporter&)> type_rel_func); /*! - * \brief Set the type key of attributes. - * \param type_key The type of of the attrs field. + * \brief Set the attrs type key and index to be AttrsType. + * \tparam AttrsType the attribute type to be set. * \return reference to self. */ - inline OpRegistry& set_attrs_type_key(const std::string& type_key); + template + inline OpRegistry& set_attrs_type(); /*! * \brief Set the num_inputs * \param n The number of inputs to be set. * \return reference to self. */ @@ -397,7 +398,7 @@ class OpMap { // implementations inline const OpNode* Op::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } template @@ -496,10 +497,10 @@ inline OpRegistry& OpRegistry::set_num_inputs(int32_t n) { // NOLINT(*) return *this; } -inline OpRegistry& OpRegistry::set_attrs_type_key( // NOLINT(*) - const std::string& type_key) { - get()->attrs_type_key = type_key; - get()->attrs_type_index = Node::TypeKey2Index(type_key.c_str()); +template +inline OpRegistry& OpRegistry::set_attrs_type() { // NOLINT(*) + get()->attrs_type_key = AttrsType::_type_key; + get()->attrs_type_index = AttrsType::RuntimeTypeIndex(); return *this; } diff --git a/include/tvm/relay/pattern_functor.h b/include/tvm/relay/pattern_functor.h index 7f1c47e03592..c15523cb25de 100644 --- a/include/tvm/relay/pattern_functor.h +++ b/include/tvm/relay/pattern_functor.h @@ -57,8 +57,8 @@ class PatternFunctor; #define RELAY_PATTERN_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitPattern_(static_cast(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitPattern_(static_cast(n.get()), \ std::forward(args)...); \ }); @@ -66,7 +66,7 @@ template class PatternFunctor { private: using TSelf = PatternFunctor; - using FType = tvm::IRFunctor; + using FType = tvm::IRFunctor; public: /*! \brief the result type of this functor */ @@ -103,7 +103,7 @@ class PatternFunctor { virtual R VisitPattern_(const PatternTupleNode* op, Args... args) PATTERN_FUNCTOR_DEFAULT; virtual R VisitPatternDefault_(const Node* op, Args...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; } diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index a2119c90f750..08ea3075cb83 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -134,16 +134,16 @@ class PassContext : public NodeRef { * \return const access pointer. */ const PassContextNode* operator->() const { - CHECK(node_.get() != nullptr); - return static_cast(node_.get()); + CHECK(get() != nullptr); + return static_cast(get()); } /*! * \brief mutable accessor. * \return mutable access pointer. */ PassContextNode* operator->() { - CHECK(node_.get() != nullptr); - return static_cast(node_.get()); + CHECK(get() != nullptr); + return static_cast(get_mutable()); } /*! * \brief Construct a PassContext containing the default configurations. 
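For downstream code, the visible API change above is that set_attrs_type<AttrsType>() now records both the attrs type key and its runtime type index, where set_attrs_type_key() took only the key string. A sketch of a registration migrated to the new form (MyDemoAttrs and the op name are made up for illustration):

#include <tvm/attrs.h>
#include <tvm/relay/op.h>

namespace tvm {
namespace relay {

// Hypothetical attrs node used only to illustrate the registration call.
struct MyDemoAttrs : public AttrsNode<MyDemoAttrs> {
  int axis;
  TVM_DECLARE_ATTRS(MyDemoAttrs, "relay.attrs.MyDemoAttrs") {
    TVM_ATTR_FIELD(axis).set_default(0).describe("Axis to operate on.");
  }
};

RELAY_REGISTER_OP("demo.my_op")
.describe("Hypothetical op illustrating set_attrs_type.")
.set_num_inputs(1)
.set_attrs_type<MyDemoAttrs>();  // was: set_attrs_type_key("relay.attrs.MyDemoAttrs")

}  // namespace relay
}  // namespace tvm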
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h index 16e36785c533..a5cc3c83383e 100644 --- a/include/tvm/relay/type.h +++ b/include/tvm/relay/type.h @@ -58,7 +58,7 @@ class TypeNode : public RelayNode { class Type : public NodeRef { public: Type() {} - explicit Type(NodePtr p) : NodeRef(p) {} + explicit Type(ObjectPtr p) : NodeRef(p) {} using ContainerType = TypeNode; }; @@ -430,10 +430,11 @@ class TypeReporterNode : public Node { class TypeReporter : public NodeRef { public: TypeReporter() {} - explicit TypeReporter(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) { + explicit TypeReporter(::tvm::ObjectPtr<::tvm::Object> n) : NodeRef(n) { } TypeReporterNode* operator->() const { - return static_cast(node_.get()); + return const_cast( + static_cast(get())); } using ContainerType = TypeReporterNode; }; diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index b058fd63a2f5..267504beb11a 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -98,13 +98,12 @@ typedef enum { kTVMType = 5U, kTVMContext = 6U, kArrayHandle = 7U, - kNodeHandle = 8U, + kObjectHandle = 8U, kModuleHandle = 9U, kFuncHandle = 10U, kStr = 11U, kBytes = 12U, kNDArrayContainer = 13U, - kObjectHandle = 14U, // Extension codes for other frameworks to integrate TVM PackedFunc. // To make sure each framework's id do not conflict, use first and // last sections to mark ranges. diff --git a/include/tvm/runtime/memory.h b/include/tvm/runtime/memory.h index 6b4f01e4ac9b..01c08d324fcb 100644 --- a/include/tvm/runtime/memory.h +++ b/include/tvm/runtime/memory.h @@ -69,7 +69,7 @@ class ObjAllocatorBase { "make_node can only be used to create NodeBase"); T* ptr = Handler::New(static_cast(this), std::forward(args)...); - ptr->type_index_ = T::type_index(); + ptr->type_index_ = T::RuntimeTypeIndex(); ptr->deleter_ = Handler::Deleter(); return ObjectPtr(ptr); } diff --git a/include/tvm/runtime/node_base.h b/include/tvm/runtime/node_base.h deleted file mode 100644 index 8b47c18a09a7..000000000000 --- a/include/tvm/runtime/node_base.h +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/runtime/node_base.h - * \brief Base data structure for Node. - * - * \note Node is not a runtime feature. - * This file only exposes the signature of NodePtr for PackedFunc. - */ -#ifndef TVM_RUNTIME_NODE_BASE_H_ -#define TVM_RUNTIME_NODE_BASE_H_ - -#include -#include - -namespace tvm { - -// forward declarations -template -class NodePtr; -class Node; -class NodeRef; - -/*! - * \brief Base class of Node for runtime destructor purposes. - * - * Node is a reference counted object which is used to construct AST. 
- * Each node is backed by a custom deleter, which deletes the object. - * Do not call create raw Node pointer, always use tvm::make_node. - * - * \note In most cases, please inheritate tvm::Node. - * \sa Node, NodePtr, make_node - */ -class NodeBase { - public: - /*! - * \brief type of NodeBase deleter - * \param self pointer to the NodeBase. - */ - typedef void (*FDeleter)(NodeBase* self); - - protected: - // default constructor and copy constructor - NodeBase() {} - // override the copy and assign constructors to do nothing. - // This is to make sure only contents, but not deleter and ref_counter - // are copied when a child class copies itself. - NodeBase(const NodeBase& other) { // NOLINT(*) - } - NodeBase(NodeBase&& other) { // NOLINT(*) - } - NodeBase& operator=(const NodeBase& other) { //NOLINT(*) - return *this; - } - NodeBase& operator=(NodeBase&& other) { //NOLINT(*) - return *this; - } - - private: - /*! \brief Internal reference counter */ - std::atomic ref_counter_{0}; - /*! - * \brief deleter of this object to enable customized allocation. - * If the deleter is nullptr, no deletion will be performed. - * The creator of the Node must always set the deleter field properly. - */ - FDeleter deleter_ = nullptr; - // reference counting functions - void IncRef() { - ref_counter_.fetch_add(1, std::memory_order_relaxed); - } - void DecRef() { - if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { - std::atomic_thread_fence(std::memory_order_acquire); - if (this->deleter_ != nullptr) { - (*this->deleter_)(this); - } - } - } - int use_count() const { - return ref_counter_.load(std::memory_order_relaxed); - } - // friend declaration - template - friend class NodePtr; - template - friend NodePtr make_node(Args&&...); -}; - -/*! - * \brief Smart pointer for Node containers, - * must be subclass of NodeBase - * \tparam T the content data type. - */ -template -class NodePtr { - public: - /*! \brief default constructor */ - NodePtr() {} - /*! \brief default constructor */ - NodePtr(std::nullptr_t) {} // NOLINT(*) - /*! - * \brief copy constructor - * \param other The value to be moved - */ - NodePtr(const NodePtr& other) // NOLINT(*) - : NodePtr(other.data_) { - } - /*! - * \brief copy constructor - * \param other The value to be moved - */ - template - NodePtr(const NodePtr& other) // NOLINT(*) - : NodePtr(other.data_) { - static_assert(std::is_base_of::value, - "can only assign of child class NodePtr to parent"); - } - /*! - * \brief move constructor - * \param other The value to be moved - */ - NodePtr(NodePtr&& other) // NOLINT(*) - : data_(other.data_) { - other.data_ = nullptr; - } - /*! - * \brief move constructor - * \param other The value to be moved - */ - template - NodePtr(NodePtr&& other) // NOLINT(*) - : data_(other.data_) { - static_assert(std::is_base_of::value, - "can only assign of child class NodePtr to parent"); - other.data_ = nullptr; - } - /*! \brief destructor */ - ~NodePtr() { - this->reset(); - } - /*! - * \brief Swap this array with another NDArray - * \param other The other NDArray - */ - void swap(NodePtr& other) { // NOLINT(*) - std::swap(data_, other.data_); - } - /*! - * \return Get the content of the pointer - */ - T* get() const { - return static_cast(data_); - } - /*! - * \return The pointer - */ - T* operator->() const { - return get(); - } - /*! - * \return The reference - */ - T& operator*() const { // NOLINT(*) - return *get(); - } - /*! - * \brief copy assignmemt - * \param other The value to be assigned. - * \return reference to self. 
- */ - NodePtr& operator=(const NodePtr& other) { // NOLINT(*) - // takes in plane operator to enable copy elison. - // copy-and-swap idiom - NodePtr(other).swap(*this); // NOLINT(*) - return *this; - } - /*! - * \brief move assignmemt - * \param other The value to be assigned. - * \return reference to self. - */ - NodePtr& operator=(NodePtr&& other) { // NOLINT(*) - // copy-and-swap idiom - NodePtr(std::move(other)).swap(*this); // NOLINT(*) - return *this; - } - /*! \brief reset the content of ptr to be nullptr */ - void reset() { - if (data_ != nullptr) { - data_->DecRef(); - data_ = nullptr; - } - } - /*! \return The use count of the ptr, for debug purposes */ - int use_count() const { - return data_ != nullptr ? data_->use_count() : 0; - } - /*! \return whether the reference is unique */ - bool unique() const { - return data_ != nullptr && data_->use_count() == 1; - } - /*! \return Whether two NodePtr do not equals each other */ - bool operator==(const NodePtr& other) const { - return data_ == other.data_; - } - /*! \return Whether two NodePtr equals each other */ - bool operator!=(const NodePtr& other) const { - return data_ != other.data_; - } - /*! \return Whether the pointer is nullptr */ - bool operator==(std::nullptr_t null) const { - return data_ == nullptr; - } - /*! \return Whether the pointer is not nullptr */ - bool operator!=(std::nullptr_t null) const { - return data_ != nullptr; - } - - private: - /*! \brief internal pointer field */ - NodeBase* data_{nullptr}; - /*! - * \brief constructor from NodeBase - * \param data The node base pointer - */ - explicit NodePtr(NodeBase* data) - : data_(data) { - if (data != nullptr) { - data_->IncRef(); - } - } - // friend declaration - friend class Node; - template - friend class NodePtr; - template - friend NodePtr make_node(Args&&...); -}; -} // namespace tvm - -#endif // TVM_RUNTIME_NODE_BASE_H_ diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 7291510c16df..143f3bb35220 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -65,7 +65,7 @@ enum TypeIndex { * - _type_index: * Static type index of the object, if assigned to TypeIndex::kDynamic * the type index will be assigned during runtime. - * Runtime type index can be accessed by ObjectType::type_index(); + * Runtime type index can be accessed by ObjectType::TypeIndex(); * - _type_key: * The unique string identifier of tyep type. * - _type_final: @@ -147,10 +147,23 @@ class Object { * \param self pointer to the Object. */ typedef void (*FDeleter)(Object* self); - /*! \return The internal type index of the object. */ + /*! \return The internal runtime type index of the object. */ uint32_t type_index() const { return type_index_; } + /*! + * \return the type key of the object. + * \note this operation is expensive, can be used for error reporting. + */ + std::string GetTypeKey() const { + return TypeIndex2Key(type_index_); + } + /*! + * \return A hash value of the return of GetTypeKey. + */ + size_t GetTypeKeyHash() const { + return TypeIndex2KeyHash(type_index_); + } /*! * Check if the object is an instance of TargetType. * \tparam TargetType The target type to be checked. @@ -159,6 +172,25 @@ class Object { template inline bool IsInstance() const; + /*! + * \brief Get the type key of the corresponding index from runtime. + * \param tindex The type index. + * \return the result. + */ + TVM_DLL static std::string TypeIndex2Key(uint32_t tindex); + /*! + * \brief Get the type key hash of the corresponding index from runtime. 
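The object.h hunk above pairs the fast, index-based IsInstance check with string-level reflection (GetTypeKey, GetTypeKeyHash) for error paths. A hedged usage sketch, assuming the standard TVM headers of this series:

    #include <tvm/node/container.h>   // ArrayNode
    #include <tvm/runtime/object.h>

    // GetTypeKey() is documented above as expensive (it maps type_index_
    // back to its string key), so keep it on the failure path only.
    std::string DescribeOrFail(const tvm::runtime::Object* obj) {
      if (obj->IsInstance<tvm::ArrayNode>()) {
        return "array";
      }
      LOG(FATAL) << "unexpected object of type " << obj->GetTypeKey();
      return "";
    }
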
+ * \param tindex The type index. + * \return the related key-hash. + */ + TVM_DLL static size_t TypeIndex2KeyHash(uint32_t tindex); + /*! + * \brief Get the type index of the corresponding key from runtime. + * \param key The type key. + * \return the result. + */ + TVM_DLL static uint32_t TypeKey2Index(const char* key); + #if TVM_OBJECT_ATOMIC_REF_COUNTER using RefCounterType = std::atomic; #else @@ -170,9 +202,30 @@ class Object { static constexpr bool _type_final = false; static constexpr uint32_t _type_child_slots = 0; static constexpr bool _type_child_slots_can_overflow = true; - static const uint32_t _GetOrAllocRuntimeTypeIndex() { + static uint32_t _GetOrAllocRuntimeTypeIndex() { return 0; } + static uint32_t RuntimeTypeIndex() { + return 0; + } + + // Default constructor and copy constructor + Object() {} + // Override the copy and assign constructors to do nothing. + // This is to make sure only contents, but not deleter and ref_counter + // are copied when a child class copies itself. + // This will enable us to use make_object(*obj_ptr) + // to copy an existing object. + Object(const Object& other) { // NOLINT(*) + } + Object(Object&& other) { // NOLINT(*) + } + Object& operator=(const Object& other) { //NOLINT(*) + return *this; + } + Object& operator=(Object&& other) { //NOLINT(*) + return *this; + } protected: // The fields of the base object cell. @@ -215,18 +268,6 @@ class Object { uint32_t type_child_slots, bool type_child_slots_can_overflow); - /*! - * \brief Get the type key of the corresponding index from runtime. - * \param tindex The type index. - */ - TVM_DLL static std::string TypeIndex2Key(uint32_t tindex); - - /*! - * \brief Get the type index of the corresponding key from runtime. - * \param key The type key. - */ - TVM_DLL static uint32_t TypeKey2Index(const char* key); - private: // reference counter related operations /*! \brief developer function, increases reference counter. */ @@ -256,6 +297,32 @@ class Object { friend class TVMObjectCAPI; }; +/*! + * \brief Get a reference type from a raw object ptr type + * + * It is always important to get a reference type + * if we want to return a value as reference or keep + * the node alive beyond the scope of the function. + * + * \param ptr The node pointer + * \tparam RefType The reference type + * \tparam ObjectType The node type + * \return The corresponding RefType + */ +template +inline RefType GetRef(const ObjectType* ptr); + +/*! + * \brief Downcast a base reference type to a more specific type. + * + * \param ref The inptut reference + * \return The corresponding SubRef. + * \tparam SubRef The target specific reference type. + * \tparam BaseRef the current reference type. + */ +template +inline SubRef Downcast(BaseRef ref); + /*! * \brief A custom smart pointer for Object. * \tparam T the content data type. @@ -389,7 +456,7 @@ class ObjectPtr { /*! \brief internal pointer field */ Object* data_{nullptr}; /*! - * \brief constructor from NodeBase + * \brief constructor from Object * \param data The data pointer */ explicit ObjectPtr(Object* data) : data_(data) { @@ -400,6 +467,7 @@ class ObjectPtr { // friend classes friend class Object; friend class ObjectRef; + friend struct ObjectHash; template friend class ObjectPtr; template @@ -407,6 +475,9 @@ class ObjectPtr { friend class TVMPODValue_; friend class TVMArgsSetter; friend class TVMRetValue; + friend class TVMArgValue; + template + friend RefType GetRef(const ObjType* ptr); }; /*! 
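This hunk declares the two conversion helpers, GetRef and Downcast, that replace the old NodePtr plumbing. A minimal sketch of their intended use; relay::Type/TypeNode are the types touched at the top of this patch:

    // A visitor usually holds a raw node pointer; GetRef turns it back
    // into an owning reference and bumps the refcount.
    tvm::relay::Type ToRef(const tvm::relay::TypeNode* raw) {
      return tvm::runtime::GetRef<tvm::relay::Type>(raw);
    }

    // Downcast converts a base reference to a more specific one, with a
    // CHECK on IsInstance (see its definition later in this file).
    tvm::relay::Type FromBase(tvm::NodeRef base) {
      return tvm::runtime::Downcast<tvm::relay::Type>(base);
    }
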
\brief Base class of all object reference */ @@ -416,10 +487,54 @@ class ObjectRef { ObjectRef() = default; /*! \brief Constructor from existing object ptr */ explicit ObjectRef(ObjectPtr data) : data_(data) {} + /*! + * \brief Comparator + * \param other Another object ref. + * \return the compare result. + */ + bool same_as(const ObjectRef& other) const { + return data_ == other.data_; + } + /*! + * \brief Comparator + * \param other Another object ref. + * \return the compare result. + */ + bool operator==(const ObjectRef& other) const { + return data_ == other.data_; + } + /*! + * \brief Comparator + * \param other Another node ref. + * \return the compare result. + */ + bool operator!=(const ObjectRef& other) const { + return data_ != other.data_; + } + /*! + * \brief Comparator + * \param other Another object ref by address. + * \return the compare result. + */ + bool operator<(const ObjectRef& other) const { + return data_.get() < other.data_.get(); + } + /*! \return whether the expression is null */ + bool defined() const { + return data_ != nullptr; + } /*! \return the internal object pointer */ - inline const Object* get() const; + const Object* get() const { + return data_.get(); + } /*! \return the internal node pointer */ - inline const Object* operator->() const; + const Object* operator->() const { + return get(); + } + /*! \return whether the reference is unique */ + bool unique() const { + return data_.unique(); + } /*! * \brief Try to downcast the internal Object to a * raw pointer of a corresponding type. @@ -434,25 +549,81 @@ class ObjectRef { template inline const ObjectType* as() const; - /*! \brief type indicate the container type */ + /*! \brief type indicate the container type. */ using ContainerType = Object; protected: /*! \brief Internal pointer that backs the reference. */ ObjectPtr data_; + /*! \return return a mutable internal ptr, can be used by sub-classes. */ + Object* get_mutable() const { + return data_.get(); + } + /*! + * \brief Internal helper function downcast a ref without check. + * \note Only used for internal dev purposes. + * \tparam T The target reference type. + * \return The casted result. + */ + template + static T DowncastNoCheck(ObjectRef ref) { + return T(std::move(ref.data_)); + } + /*! + * \brief Internal helper function get data_ as ObjectPtr of ObjectType. + * \note only used for internal dev purpose. + * \tparam ObjectType The corresponding object type. + * \return the corresponding type. + */ + template + static ObjectPtr GetDataPtr(const ObjectRef& ref) { + return ObjectPtr(ref.data_.data_); + } // friend classes. + friend struct ObjectHash; friend class TVMRetValue; friend class TVMArgsSetter; + template + friend SubRef Downcast(BaseRef ref); }; + +/*! \brief ObjectRef hash functor */ +struct ObjectHash { + size_t operator()(const ObjectRef& a) const { + return operator()(a.data_); + } + + template + size_t operator()(const ObjectPtr& a) const { + return std::hash()(a.get()); + } +}; + + +/*! \brief ObjectRef equal functor */ +struct ObjectEqual { + bool operator()(const ObjectRef& a, const ObjectRef& b) const { + return a.same_as(b); + } + + template + size_t operator()(const ObjectPtr& a, const ObjectPtr& b) const { + return a == b; + } +}; + + /*! * \brief helper macro to declare a base object type that can be inheritated. * \param TypeName The name of the current type. 
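The comparators plus the ObjectHash/ObjectEqual functors above exist so references can key STL containers by pointer identity. Sketch:

    #include <unordered_map>

    // Keys hash the underlying pointer and compare with same_as(), so two
    // refs to the same node land in the same bucket.
    using RefCount =
        std::unordered_map<tvm::runtime::ObjectRef, int,
                           tvm::runtime::ObjectHash, tvm::runtime::ObjectEqual>;

    void Count(RefCount* table, const tvm::runtime::ObjectRef& ref) {
      (*table)[ref] += 1;
    }
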
* \param ParentType The name of the ParentType */ #define TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ - static const uint32_t type_index() { \ - if (_type_index != TypeIndex::kDynamic) return _type_index; \ + static const uint32_t RuntimeTypeIndex() { \ + if (_type_index != ::tvm::runtime::TypeIndex::kDynamic) { \ + return _type_index; \ + } \ return _GetOrAllocRuntimeTypeIndex(); \ } \ static const uint32_t _GetOrAllocRuntimeTypeIndex() { \ @@ -551,11 +722,11 @@ inline bool Object::IsInstance() const { if (TargetType::_type_final) { // if the target type is a final type // then we only need to check the equivalence. - return self->type_index_ == TargetType::type_index(); + return self->type_index_ == TargetType::RuntimeTypeIndex(); } else { // if target type is a non-leaf type // Check if type index falls into the range of reserved slots. - uint32_t begin = TargetType::type_index(); + uint32_t begin = TargetType::RuntimeTypeIndex(); // The condition will be optimized by constant-folding. if (TargetType::_type_child_slots != 0) { uint32_t end = begin + TargetType::_type_child_slots; @@ -565,22 +736,15 @@ inline bool Object::IsInstance() const { } if (!TargetType::_type_child_slots_can_overflow) return false; // Invariance: parent index is always smaller than the child. - if (self->type_index_ < TargetType::type_index()) return false; + if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false; // The rare slower-path, check type hierachy. - return self->DerivedFrom(TargetType::type_index()); + return self->DerivedFrom(TargetType::RuntimeTypeIndex()); } } else { return false; } } -inline const Object* ObjectRef::get() const { - return data_.data_; -} - -inline const Object* ObjectRef::operator->() const { - return get(); -} template inline const ObjectType* ObjectRef::as() const { @@ -591,7 +755,27 @@ inline const ObjectType* ObjectRef::as() const { return nullptr; } } + +template +inline RefType GetRef(const ObjType* ptr) { + static_assert(std::is_base_of::value, + "Can only cast to the ref of same container type"); + return RefType(ObjectPtr(const_cast(static_cast(ptr)))); +} + +template +inline SubRef Downcast(BaseRef ref) { + CHECK(ref->template IsInstance()) + << "Downcast from " << ref->GetTypeKey() << " to " + << SubRef::ContainerType::_type_key << " failed."; + return SubRef(std::move(ref.data_)); +} + } // namespace runtime + +template +using NodePtr = runtime::ObjectPtr; + } // namespace tvm #endif // TVM_RUNTIME_OBJECT_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 2bfa3323e4f1..649a5058a9a5 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -40,7 +40,6 @@ #include "module.h" #include "ndarray.h" #include "object.h" -#include "node_base.h" // Whether use TVM runtime in header only mode. 
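The IsInstance rewrite above relies on type indices being allocated so that a parent's children occupy a contiguous slot range. A standalone model of the fast path, with made-up indices (say Base at index 10 reserving 4 child slots, so any index in [10, 14) is a Base); the slower DerivedFrom overflow path is omitted:

    #include <cstdint>

    bool IsInstanceFastPath(uint32_t self_index, uint32_t begin,
                            uint32_t child_slots) {
      if (child_slots != 0) {
        uint32_t end = begin + child_slots;
        return self_index >= begin && self_index < end;
      }
      // Types without reserved slots only match their own index here.
      return self_index == begin;
    }
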
#ifndef TVM_RUNTIME_HEADER_ONLY @@ -52,6 +51,8 @@ namespace tvm { class Integer; class DataType; class Expr; +class Node; +class NodeRef; namespace runtime { @@ -490,9 +491,12 @@ class TVMPODValue_ { return NDArray(static_cast(value_.v_handle)); } operator ObjectRef() const { - if (type_code_ == kNull) return ObjectRef(ObjectPtr(nullptr)); + if (type_code_ == kNull) { + return ObjectRef(ObjectPtr(nullptr)); + } TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); - return ObjectRef(ObjectPtr(static_cast(value_.v_handle))); + return ObjectRef( + ObjectPtr(static_cast(value_.v_handle))); } operator TVMContext() const { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); @@ -512,9 +516,14 @@ class TVMPODValue_ { CHECK_LT(type_code_, kExtEnd); return static_cast(value_.v_handle)[0]; } + template::value>::type> + inline bool IsObjectRef() const; int type_code() const { return type_code_; } + /*! * \brief return handle as specific pointer type. * \tparam T the data type. @@ -567,6 +576,7 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator NDArray; using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator ObjectRef; + using TVMPODValue_::IsObjectRef; // conversion operator. operator std::string() const { @@ -616,15 +626,9 @@ class TVMArgValue : public TVMPODValue_ { typename = typename std::enable_if< std::is_class::value>::type> inline operator T() const; - template::value>::type> - inline bool IsNodeType() const; inline operator tvm::DataType() const; inline operator tvm::Expr() const; inline operator tvm::Integer() const; - // get internal node ptr, if it is node - inline NodePtr& node_sptr(); }; /*! @@ -663,6 +667,8 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator ObjectRef; + using TVMPODValue_::IsObjectRef; + TVMRetValue(const TVMRetValue& other) : TVMPODValue_() { this->Assign(other); } @@ -760,11 +766,19 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(ObjectRef other) { - this->Clear(); - type_code_ = kObjectHandle; - // move the handle out - value_.v_handle = other.data_.data_; - other.data_.data_ = nullptr; + return operator=(std::move(other.data_)); + } + template + TVMRetValue& operator=(ObjectPtr other) { + if (other.data_ != nullptr) { + this->Clear(); + type_code_ = kObjectHandle; + // move the handle out + value_.v_handle = other.data_; + other.data_ = nullptr; + } else { + SwitchToPOD(kNull); + } return *this; } TVMRetValue& operator=(PackedFunc f) { @@ -814,7 +828,7 @@ class TVMRetValue : public TVMPODValue_ { } /*! 
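With the TVMRetValue assignments above, a packed function can return object handles directly, and an empty ObjectPtr now degrades to kNull instead of a kObjectHandle wrapping a null pointer. A hedged sketch; the global name is made up:

    #include <tvm/node/container.h>
    #include <tvm/runtime/registry.h>

    TVM_REGISTER_GLOBAL("example.MakeArray")
    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
      auto node = tvm::make_node<tvm::ArrayNode>();
      // Uses the new operator=(ObjectPtr<T>): moves the handle out and
      // tags the slot kObjectHandle (kNull if the pointer were empty).
      *rv = node;
    });
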
\return The value field, if the data is POD */ const TVMValue& value() const { - CHECK(type_code_ != kNodeHandle && + CHECK(type_code_ != kObjectHandle && type_code_ != kFuncHandle && type_code_ != kModuleHandle && type_code_ != kStr) << "TVMRetValue.value can only be used for POD data"; @@ -827,8 +841,6 @@ class TVMRetValue : public TVMPODValue_ { inline operator T() const; template inline TNodeRef AsNodeRef() const; - inline TVMRetValue& operator=(const NodeRef& other); - inline TVMRetValue& operator=(const NodePtr& other); // type related inline operator tvm::DataType() const; inline TVMRetValue& operator=(const tvm::DataType& other); @@ -857,11 +869,6 @@ class TVMRetValue : public TVMPODValue_ { *this = other.operator NDArray(); break; } - case kNodeHandle: { - SwitchToClass >( - kNodeHandle, *other.template ptr >()); - break; - } case kObjectHandle: { *this = other.operator ObjectRef(); break; @@ -908,7 +915,6 @@ class TVMRetValue : public TVMPODValue_ { case kStr: delete ptr(); break; case kFuncHandle: delete ptr(); break; case kModuleHandle: delete ptr(); break; - case kNodeHandle: delete ptr >(); break; case kNDArrayContainer: { static_cast(value_.v_handle)->DecRef(); break; @@ -939,7 +945,6 @@ inline const char* TypeCode2Str(int type_code) { case kBytes: return "bytes"; case kHandle: return "handle"; case kNull: return "NULL"; - case kNodeHandle: return "NodeHandle"; case kArrayHandle: return "ArrayHandle"; case kTVMType: return "TVMType"; case kTVMContext: return "TVMContext"; @@ -1057,8 +1062,6 @@ inline PackedFunc::FType PackedFunc::body() const { return body_; } - - // internal namespace namespace detail { @@ -1163,8 +1166,12 @@ class TVMArgsSetter { type_codes_[i] = kNDArrayContainer; } void operator()(size_t i, const ObjectRef& value) const { // NOLINT(*) - values_[i].v_handle = value.data_.data_; - type_codes_[i] = kObjectHandle; + if (value.defined()) { + values_[i].v_handle = value.data_.data_; + type_codes_[i] = kObjectHandle; + } else { + type_codes_[i] = kNull; + } } void operator()(size_t i, const TVMRetValue& value) const { // NOLINT(*) if (value.type_code() == kStr) { @@ -1181,8 +1188,6 @@ class TVMArgsSetter { typename = typename std::enable_if< extension_type_info::code != 0>::type> inline void operator()(size_t i, const T& value) const; - // NodeRef related extenstions: in tvm/packed_func_ext.h - inline void operator()(size_t i, const NodeRef& other) const; // NOLINT(*) inline void operator()(size_t i, const tvm::DataType& t) const; private: diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index af3e929ac3fa..36265667e5b6 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -56,7 +56,7 @@ enum AttachType : int { class Stage : public NodeRef { public: Stage() {} - explicit Stage(NodePtr n) : NodeRef(n) {} + explicit Stage(ObjectPtr n) : NodeRef(n) {} /*! * \brief create a new schedule for op. * \param op The operator in the schedule @@ -280,7 +280,7 @@ class Stage : public NodeRef { class Schedule : public NodeRef { public: Schedule() {} - explicit Schedule(NodePtr n) : NodeRef(n) {} + explicit Schedule(ObjectPtr n) : NodeRef(n) {} /*! * \brief Get a copy of current schedule. * \return The copied schedule. @@ -403,7 +403,7 @@ class Schedule : public NodeRef { class IterVarRelation : public NodeRef { public: IterVarRelation() {} - explicit IterVarRelation(NodePtr n) : NodeRef(n) {} + explicit IterVarRelation(ObjectPtr n) : NodeRef(n) {} /*! 
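The TVMArgsSetter hunk above makes an undefined ObjectRef travel as kNull, so callees keep their existing null checks instead of receiving an object handle that wraps nullptr. Sketch:

    void CallMaybeNull(const tvm::runtime::PackedFunc& f,
                       const tvm::runtime::ObjectRef& ref) {
      // If ref.defined() is false, the argument is packed with type code
      // kNull, exactly as if the caller had passed nullptr.
      f(ref);
    }
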
* \brief access the internal node container * \return the pointer to the internal node container @@ -417,7 +417,7 @@ class IterVarRelation : public NodeRef { class IterVarAttr : public NodeRef { public: IterVarAttr() {} - explicit IterVarAttr(NodePtr n) : NodeRef(n) {} + explicit IterVarAttr(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -745,25 +745,25 @@ class SingletonNode : public IterVarRelationNode { // implementations inline const StageNode* Stage::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline StageNode* Stage::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } inline const ScheduleNode* Schedule::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline ScheduleNode* Schedule::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } inline const IterVarRelationNode* IterVarRelation::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline const IterVarAttrNode* IterVarAttr::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm #endif // TVM_SCHEDULE_H_ diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index f37cc7bed7d1..6471c9c69a62 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -50,7 +50,7 @@ class Tensor : public NodeRef { public: /*! \brief default constructor, used internally */ Tensor() {} - explicit Tensor(NodePtr n) : NodeRef(n) {} + explicit Tensor(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -141,7 +141,7 @@ class Operation : public ir::FunctionRef { public: /*! \brief default constructor */ Operation() {} - explicit Operation(NodePtr n) : FunctionRef(n) {} + explicit Operation(ObjectPtr n) : FunctionRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -189,7 +189,7 @@ class TensorNode : public Node { // Implementations of inline functions inline const TensorNode* Tensor::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline size_t Tensor::ndim() const { @@ -250,19 +250,17 @@ DEFINE_OVERLOAD_SLICE_BINARY_OP(<); // NOLINT(*) namespace std { template <> -struct hash<::tvm::Operation> { - std::size_t operator()(const ::tvm::Operation& k) const { - return k.hash(); - } +struct hash<::tvm::Operation> : public ::tvm::NodeHash { }; template <> struct hash<::tvm::Tensor> { std::size_t operator()(const ::tvm::Tensor& k) const { + ::tvm::NodeHash hasher; if (k.defined() && k->op.defined()) { - return k->op.hash(); + return hasher(k->op); } else{ - return k.hash(); + return hasher(k); } } }; diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h index b5ca6eb4358b..152a27f6e2a9 100644 --- a/include/tvm/tensor_intrin.h +++ b/include/tvm/tensor_intrin.h @@ -112,7 +112,7 @@ class TensorIntrinNode : public Node { }; inline const TensorIntrinNode* TensorIntrin::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } // Internal node container of tensor intrinsic calling. 
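The tensor.h hunk above re-expresses std::hash<Tensor> through NodeHash, hashing via the defining op when one exists. Call sites are unchanged; sketch:

    #include <unordered_set>

    // Tensors from the same Operation keep colliding as before; undefined
    // tensors fall back to hashing the reference itself.
    bool Seen(std::unordered_set<tvm::Tensor>* cache, const tvm::Tensor& t) {
      return !cache->insert(t).second;
    }
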
@@ -170,7 +170,7 @@ class TensorIntrinCallNode : public Node { }; inline const TensorIntrinCallNode* TensorIntrinCall::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm diff --git a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc index 1eff6c45e1fc..b4bfd4270775 100644 --- a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc @@ -242,7 +242,7 @@ extern "C" int funcInvokeCallback(TVMValue *args, for (int i = 0; i < numArgs; ++i) { TVMValue arg = args[i]; int tcode = typeCodes[i]; - if (tcode == kNodeHandle || tcode == kFuncHandle || tcode == kModuleHandle) { + if (tcode == kObjectHandle || tcode == kFuncHandle || tcode == kModuleHandle) { TVMCbArgToReturn(&arg, tcode); } jobject jarg = tvmRetValueToJava(env, arg, tcode); @@ -259,8 +259,8 @@ extern "C" int funcInvokeCallback(TVMValue *args, reinterpret_cast(resourceHandle), jargs); TVMFuncArgsThreadLocalEntry *e = TVMFuncArgsThreadLocalStore::Get(); - const int prevNumStrArg = e->tvmFuncArgPushedStrs.size(); - const int prevNumBytesArg = e->tvmFuncArgPushedBytes.size(); + const size_t prevNumStrArg = e->tvmFuncArgPushedStrs.size(); + const size_t prevNumBytesArg = e->tvmFuncArgPushedBytes.size(); // convert returned (java) TVMValue to (C) TVMValue env->CallStaticVoidMethod(clsFunc, pushArgToStack, jretValue); diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h index fa8b69f9b70a..9555c0e7b3ea 100644 --- a/nnvm/include/nnvm/compiler/util.h +++ b/nnvm/include/nnvm/compiler/util.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -56,7 +56,7 @@ inline tvm::Array ShapeToArray(TShape shape) { * \return An Array of Expr, where each element is a constant int32 */ inline tvm::Array ShapeToIntArray(TShape shape) { - return tvm::Array(ShapeToArray(shape).node_); + return tvm::Downcast >(ShapeToArray(shape)); } } // namespace compiler } // namespace nnvm diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc index 3da95e879fa7..c9cdaef63935 100644 --- a/nnvm/src/compiler/compile_engine.cc +++ b/nnvm/src/compiler/compile_engine.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
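The nnvm hunk above replaces constructing an Array straight from another array's node_ with a checked Downcast. A sketch of the conversion; note the CHECK only verifies the container is an ArrayNode, element types remain the caller's responsibility:

    tvm::Array<tvm::Integer> AsIntArray(const tvm::Array<tvm::Expr>& arr) {
      // Elements are assumed valid ints, which ShapeToIntArray guarantees
      // by constructing each entry as a constant int32.
      return tvm::Downcast<tvm::Array<tvm::Integer>>(arr);
    }
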
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -392,6 +392,9 @@ TVM_REGISTER_GLOBAL("nnvm.compiler.CacheItem2ScheduleArgs") *rv = ret; }); +TVM_REGISTER_NODE_TYPE(GraphFuncNode); +TVM_REGISTER_NODE_TYPE(GraphCacheEntryNode); + TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const GraphFuncNode *op, IRPrinter *p) { p->stream << "GraphFunc(name=" << op->func_name diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h index 35287f5a9358..e8d33cb4be7e 100644 --- a/nnvm/src/compiler/compile_engine.h +++ b/nnvm/src/compiler/compile_engine.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -92,7 +92,7 @@ class GraphCacheEntry : public ::tvm::NodeRef { GraphCacheEntry() {} explicit GraphCacheEntry(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {} GraphCacheEntryNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } using ContainerType = GraphCacheEntryNode; }; diff --git a/nnvm/src/compiler/graph_runtime.h b/nnvm/src/compiler/graph_runtime.h index 3a847de83d9f..7b324ba100ad 100644 --- a/nnvm/src/compiler/graph_runtime.h +++ b/nnvm/src/compiler/graph_runtime.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc index bbcc62a99ad8..45f1451663e6 100644 --- a/nnvm/src/compiler/packed_func_ext.cc +++ b/nnvm/src/compiler/packed_func_ext.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
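The two TVM_REGISTER_NODE_TYPE lines added to compile_engine.cc above are needed because, under the Object protocol, a node's dynamic type index must be registered before instances can be reflected on or checked with IsInstance. The general pattern, with a purely hypothetical node:

    struct DemoNode : public tvm::Node {
      static constexpr const char* _type_key = "demo.DemoNode";
      TVM_DECLARE_NODE_TYPE_INFO(DemoNode, tvm::Node);
    };

    // Allocates the runtime type index and registers a creator so the FFI
    // can construct and identify DemoNode by its type key.
    TVM_REGISTER_NODE_TYPE(DemoNode);
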
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -115,7 +115,7 @@ TVM_REGISTER_GLOBAL("nnvm._register_compute") const Array& out_info) -> Array { TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, out_info); - if ((*ret.ptr<::tvm::NodePtr >())->derived_from()) { + if (ret.IsObjectRef()) { return {ret.operator Tensor()}; } else { return ret; diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index cafb99926bfa..ab18c2d7337a 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -1242,7 +1242,7 @@ Array GetIntArray(Array arr) { CHECK(!arr[i].defined() || arr[i].as()) << "Expect an int array"; } - return Array(arr.node_); + return Downcast >(arr); } NNVM_REGISTER_OP(slice_like) diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py index 22fb6c335dcc..2f0b5babda4d 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # coding: utf-8 -# pylint: disable=invalid-name, protected-access, too-many-branches, global-statement +# pylint: disable=invalid-name, protected-access, too-many-branches, global-statement, unused-import """Function configuration API.""" from __future__ import absolute_import @@ -32,9 +32,8 @@ from .types import TVMValue, TypeCode from .types import TVMPackedCFunc, TVMCFuncFinalizer from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64 -from .node import NodeBase +from .object import ObjectBase, _set_class_node from . import object as _object -from . 
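IsObjectRef<T>(), declared in the packed_func.h hunk earlier, is the typed query these call sites now use in place of poking at a raw NodePtr. Conceptually (a sketch of its contract, not the verbatim implementation):

    // True iff the slot holds an object handle whose dynamic type is
    // TRef::ContainerType or a subtype of it.
    template <typename TRef>
    bool IsObjectRefLike(int type_code, const tvm::runtime::Object* ptr) {
      return type_code == kObjectHandle && ptr != nullptr &&
             ptr->IsInstance<typename TRef::ContainerType>();
    }
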
import node as _node FunctionHandle = ctypes.c_void_p ModuleHandle = ctypes.c_void_p @@ -108,9 +107,9 @@ def _make_tvm_args(args, temp_args): values = (TVMValue * num_args)() type_codes = (ctypes.c_int * num_args)() for i, arg in enumerate(args): - if isinstance(arg, NodeBase): + if isinstance(arg, ObjectBase): values[i].v_handle = arg.handle - type_codes[i] = TypeCode.NODE_HANDLE + type_codes[i] = TypeCode.OBJECT_HANDLE elif arg is None: values[i].v_handle = None type_codes[i] = TypeCode.NULL @@ -148,7 +147,7 @@ def _make_tvm_args(args, temp_args): elif isinstance(arg, (list, tuple, dict, NodeGeneric)): arg = convert_to_node(arg) values[i].v_handle = arg.handle - type_codes[i] = TypeCode.NODE_HANDLE + type_codes[i] = TypeCode.OBJECT_HANDLE temp_args.append(arg) elif isinstance(arg, _CLASS_MODULE): values[i].v_handle = arg.handle @@ -164,9 +163,6 @@ def _make_tvm_args(args, temp_args): values[i].v_handle = arg.handle type_codes[i] = TypeCode.FUNC_HANDLE temp_args.append(arg) - elif isinstance(arg, _CLASS_OBJECT): - values[i].v_handle = arg.handle - type_codes[i] = TypeCode.OBJECT_HANDLE else: raise TypeError("Don't know how to handle type %s" % type(arg)) return values, type_codes, num_args @@ -226,7 +222,7 @@ def __init_handle_by_constructor__(fconstructor, args): raise get_last_ffi_error() _ = temp_args _ = args - assert ret_tcode.value in (TypeCode.NODE_HANDLE, TypeCode.OBJECT_HANDLE) + assert ret_tcode.value == TypeCode.OBJECT_HANDLE handle = ret_val.v_handle return handle @@ -247,7 +243,6 @@ def _handle_return_func(x): return _CLASS_FUNCTION(handle, False) # setup return handle for function type -_node.__init_by_constructor__ = __init_handle_by_constructor__ _object.__init_by_constructor__ = __init_handle_by_constructor__ RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py deleted file mode 100644 index 39fe0ef35525..000000000000 --- a/python/tvm/_ffi/_ctypes/node.py +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name, protected-access -# pylint: disable=no-member, missing-docstring, not-callable -from __future__ import absolute_import - -import ctypes -from ..base import _LIB, check_call, c_str -from ..node_generic import _set_class_node_base -from .types import TVMValue, TypeCode -from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func - -NodeHandle = ctypes.c_void_p -__init_by_constructor__ = None - -"""Maps node type to its constructor""" -NODE_TYPE = {} - -def _register_node(index, cls): - """register node class""" - NODE_TYPE[index] = cls - -def _return_node(x): - """Return node function""" - handle = x.v_handle - if not isinstance(handle, NodeHandle): - handle = NodeHandle(handle) - tindex = ctypes.c_int() - check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex))) - cls = NODE_TYPE.get(tindex.value, NodeBase) - # Avoid calling __init__ of cls, instead directly call __new__ - # This allows child class to implement their own __init__ - node = cls.__new__(cls) - node.handle = handle - return node - - -RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node -C_TO_PY_ARG_SWITCH[TypeCode.NODE_HANDLE] = _wrap_arg_func( - _return_node, TypeCode.NODE_HANDLE) - - -class NodeBase(object): - __slots__ = ["handle"] - # pylint: disable=no-member - def __del__(self): - if _LIB is not None: - check_call(_LIB.TVMNodeFree(self.handle)) - - def __getattr__(self, name): - ret_val = TVMValue() - ret_type_code = ctypes.c_int() - ret_success = ctypes.c_int() - check_call(_LIB.TVMNodeGetAttr( - self.handle, c_str(name), - ctypes.byref(ret_val), - ctypes.byref(ret_type_code), - ctypes.byref(ret_success))) - if not ret_success.value: - raise AttributeError( - "'%s' object has no attribute '%s'" % (str(type(self)), name)) - return RETURN_SWITCH[ret_type_code.value](ret_val) - - def __init_handle_by_constructor__(self, fconstructor, *args): - """Initialize the handle by calling constructor function. - - Parameters - ---------- - fconstructor : Function - Constructor function. - - args: list of objects - The arguments to the constructor - - Note - ---- - We have a special calling convention to call constructor functions. - So the return handle is directly set into the Node object - instead of creating a new Node. 
- """ - # assign handle first to avoid error raising - self.handle = None - handle = __init_by_constructor__(fconstructor, args) - if not isinstance(handle, NodeHandle): - handle = NodeHandle(handle) - self.handle = handle - -_set_class_node_base(NodeBase) diff --git a/python/tvm/_ffi/_ctypes/object.py b/python/tvm/_ffi/_ctypes/object.py index 5ddceb166677..c3ae56822198 100644 --- a/python/tvm/_ffi/_ctypes/object.py +++ b/python/tvm/_ffi/_ctypes/object.py @@ -21,6 +21,7 @@ import ctypes from ..base import _LIB, check_call from .types import TypeCode, RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func +from ..node_generic import _set_class_node_base ObjectHandle = ctypes.c_void_p @@ -29,6 +30,13 @@ """Maps object type to its constructor""" OBJECT_TYPE = {} +_CLASS_NODE = None + +def _set_class_node(node_class): + global _CLASS_NODE + _CLASS_NODE = node_class + + def _register_object(index, cls): """register object class""" OBJECT_TYPE[index] = cls @@ -40,7 +48,7 @@ def _return_object(x): handle = ObjectHandle(handle) tindex = ctypes.c_uint() check_call(_LIB.TVMObjectGetTypeIndex(handle, ctypes.byref(tindex))) - cls = OBJECT_TYPE.get(tindex.value, ObjectBase) + cls = OBJECT_TYPE.get(tindex.value, _CLASS_NODE) # Avoid calling __init__ of cls, instead directly call __new__ # This allows child class to implement their own __init__ obj = cls.__new__(cls) @@ -83,3 +91,6 @@ def __init_handle_by_constructor__(self, fconstructor, *args): if not isinstance(handle, ObjectHandle): handle = ObjectHandle(handle) self.handle = handle + + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 76fa96376b47..4b7b2c88ffa5 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -31,13 +31,12 @@ cdef enum TVMTypeCode: kTVMType = 5 kTVMContext = 6 kArrayHandle = 7 - kNodeHandle = 8 + kObjectHandle = 8 kModuleHandle = 9 kFuncHandle = 10 kStr = 11 kBytes = 12 kNDArrayContainer = 13 - kObjectHandle = 14 kExtBegin = 15 cdef extern from "tvm/runtime/c_runtime_api.h": @@ -78,7 +77,7 @@ ctypedef void* TVMStreamHandle ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle ctypedef void* ObjectHandle -ctypedef void* NodeHandle + ctypedef struct TVMNDArrayContainer: DLTensor dl_tensor @@ -134,18 +133,6 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int TVMObjectGetTypeIndex(ObjectHandle obj, unsigned* out_index) -cdef extern from "tvm/c_dsl_api.h": - int TVMNodeFree(NodeHandle handle) - int TVMNodeTypeKey2Index(const char* type_key, - int* out_index) - int TVMNodeGetTypeIndex(NodeHandle handle, - int* out_index) - int TVMNodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* out_value, - int* out_type_code, - int* out_success) - cdef inline py_str(const char* x): if PY_MAJOR_VERSION < 3: return x diff --git a/python/tvm/_ffi/_cython/core.pyx b/python/tvm/_ffi/_cython/core.pyx index a9349338fc6a..cbf9d5859046 100644 --- a/python/tvm/_ffi/_cython/core.pyx +++ b/python/tvm/_ffi/_cython/core.pyx @@ -17,7 +17,7 @@ include "./base.pxi" include "./object.pxi" -include "./node.pxi" +# include "./node.pxi" include "./function.pxi" include "./ndarray.pxi" diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi index ceacf7407170..a2360427b6c7 100644 --- a/python/tvm/_ffi/_cython/function.pxi +++ b/python/tvm/_ffi/_cython/function.pxi @@ -41,10 +41,9 @@ cdef int tvm_callback(TVMValue* args, for i in range(num_args): value = args[i] tcode = type_codes[i] - if (tcode == kNodeHandle or 
+ if (tcode == kObjectHandle or tcode == kFuncHandle or tcode == kModuleHandle or - tcode == kObjectHandle or tcode > kExtBegin): CALL(TVMCbArgToReturn(&value, tcode)) @@ -98,9 +97,9 @@ cdef inline int make_arg(object arg, list temp_args) except -1: """Pack arguments into c args tvm call accept""" cdef unsigned long long ptr - if isinstance(arg, NodeBase): - value[0].v_handle = (arg).chandle - tcode[0] = kNodeHandle + if isinstance(arg, ObjectBase): + value[0].v_handle = (arg).chandle + tcode[0] = kObjectHandle elif isinstance(arg, NDArrayBase): value[0].v_handle = (arg).chandle tcode[0] = (kNDArrayContainer if @@ -152,12 +151,9 @@ cdef inline int make_arg(object arg, temp_args.append(tstr) elif isinstance(arg, (list, tuple, dict, NodeGeneric)): arg = convert_to_node(arg) - value[0].v_handle = (arg).chandle - tcode[0] = kNodeHandle - temp_args.append(arg) - elif isinstance(arg, _CLASS_OBJECT): value[0].v_handle = (arg).chandle tcode[0] = kObjectHandle + temp_args.append(arg) elif isinstance(arg, _CLASS_MODULE): value[0].v_handle = c_handle(arg.handle) tcode[0] = kModuleHandle @@ -188,9 +184,7 @@ cdef inline bytearray make_ret_bytes(void* chandle): cdef inline object make_ret(TVMValue value, int tcode): """convert result to return value.""" - if tcode == kNodeHandle: - return make_ret_node(value.v_handle) - elif tcode == kObjectHandle: + if tcode == kObjectHandle: return make_ret_object(value.v_handle) elif tcode == kNull: return None @@ -314,6 +308,7 @@ cdef class FunctionBase: _CLASS_FUNCTION = None _CLASS_MODULE = None _CLASS_OBJECT = None +_CLASS_NODE = None def _set_class_module(module_class): """Initialize the module.""" @@ -327,3 +322,7 @@ def _set_class_function(func_class): def _set_class_object(obj_class): global _CLASS_OBJECT _CLASS_OBJECT = obj_class + +def _set_class_node(node_class): + global _CLASS_NODE + _CLASS_NODE = node_class diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi deleted file mode 100644 index 5e0c366e5600..000000000000 --- a/python/tvm/_ffi/_cython/node.pxi +++ /dev/null @@ -1,110 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from ... 
import _api_internal -from ..base import string_types -from ..node_generic import _set_class_node_base - -"""Maps node type to its constructor""" -NODE_TYPE = [] - -def _register_node(int index, object cls): - """register node class""" - while len(NODE_TYPE) <= index: - NODE_TYPE.append(None) - NODE_TYPE[index] = cls - - -cdef inline object make_ret_node(void* chandle): - global NODE_TYPE - cdef int tindex - cdef list node_type - cdef object cls - node_type = NODE_TYPE - CALL(TVMNodeGetTypeIndex(chandle, &tindex)) - if tindex < len(node_type): - cls = node_type[tindex] - if cls is not None: - obj = cls.__new__(cls) - else: - obj = NodeBase.__new__(NodeBase) - else: - obj = NodeBase.__new__(NodeBase) - (obj).chandle = chandle - return obj - - -cdef class NodeBase: - cdef void* chandle - - cdef _set_handle(self, handle): - cdef unsigned long long ptr - if handle is None: - self.chandle = NULL - else: - ptr = handle.value - self.chandle = (ptr) - - property handle: - def __get__(self): - if self.chandle == NULL: - return None - else: - return ctypes_handle(self.chandle) - - def __set__(self, value): - self._set_handle(value) - - def __dealloc__(self): - CALL(TVMNodeFree(self.chandle)) - - def __getattr__(self, name): - cdef TVMValue ret_val - cdef int ret_type_code, ret_succ - CALL(TVMNodeGetAttr(self.chandle, c_str(name), - &ret_val, &ret_type_code, &ret_succ)) - if ret_succ == 0: - raise AttributeError( - "'%s' object has no attribute '%s'" % (type(self), name)) - return make_ret(ret_val, ret_type_code) - - def __init_handle_by_constructor__(self, fconstructor, *args): - """Initialize the handle by calling constructor function. - - Parameters - ---------- - fconstructor : Function - Constructor function. - - args: list of objects - The arguments to the constructor - - Note - ---- - We have a special calling convention to call constructor functions. - So the return handle is directly set into the Node object - instead of creating a new Node. - """ - # avoid error raised during construction. - self.chandle = NULL - cdef void* chandle - ConstructorCall( - (fconstructor).chandle, - kNodeHandle, args, &chandle) - self.chandle = chandle - -_set_class_node_base(NodeBase) diff --git a/python/tvm/_ffi/_cython/object.pxi b/python/tvm/_ffi/_cython/object.pxi index 90be6a9c5b74..9561eab94ea2 100644 --- a/python/tvm/_ffi/_cython/object.pxi +++ b/python/tvm/_ffi/_cython/object.pxi @@ -16,6 +16,8 @@ # under the License. 
"""Maps object type to its constructor""" +from ..node_generic import _set_class_node_base + OBJECT_TYPE = [] def _register_object(int index, object cls): @@ -27,6 +29,7 @@ def _register_object(int index, object cls): cdef inline object make_ret_object(void* chandle): global OBJECT_TYPE + global _CLASS_NODE cdef unsigned tindex cdef list object_type cdef object cls @@ -39,9 +42,11 @@ cdef inline object make_ret_object(void* chandle): if cls is not None: obj = cls.__new__(cls) else: - obj = ObjectBase.__new__(ObjectBase) + # default use node base class + # TODO(tqchen) change to object after Node unifies with Object + obj = _CLASS_NODE.__new__(_CLASS_NODE) else: - obj = ObjectBase.__new__(ObjectBase) + obj = _CLASS_NODE.__new__(_CLASS_NODE) (obj).chandle = chandle return obj @@ -94,3 +99,6 @@ cdef class ObjectBase: (fconstructor).chandle, kObjectHandle, args, &chandle) self.chandle = chandle + + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/node.py b/python/tvm/_ffi/node.py index baca89d628b8..c6c151af9053 100644 --- a/python/tvm/_ffi/node.py +++ b/python/tvm/_ffi/node.py @@ -21,21 +21,8 @@ import ctypes import sys from .. import _api_internal +from .object import Object, register_object, _set_class_node from .node_generic import NodeGeneric, convert_to_node, const -from .base import _LIB, check_call, c_str, py_str, _FFI_MODE - -IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError -try: - # pylint: disable=wrong-import-position - if _FFI_MODE == "ctypes": - raise ImportError() - if sys.version_info >= (3, 0): - from ._cy3.core import _register_node, NodeBase as _NodeBase - else: - from ._cy2.core import _register_node, NodeBase as _NodeBase -except IMPORT_EXCEPT: - # pylint: disable=wrong-import-position - from ._ctypes.node import _register_node, NodeBase as _NodeBase def _new_object(cls): @@ -43,20 +30,22 @@ def _new_object(cls): return cls.__new__(cls) -class NodeBase(_NodeBase): +class NodeBase(Object): """NodeBase is the base class of all TVM language AST object.""" def __repr__(self): return _api_internal._format_str(self) def __dir__(self): - plist = ctypes.POINTER(ctypes.c_char_p)() - size = ctypes.c_uint() - check_call(_LIB.TVMNodeListAttrNames( - self.handle, ctypes.byref(size), ctypes.byref(plist))) - names = [] - for i in range(size.value): - names.append(py_str(plist[i])) - return names + fnames = _api_internal._NodeListAttrNames(self) + size = fnames(-1) + return [fnames(i) for i in range(size)] + + def __getattr__(self, name): + try: + return _api_internal._NodeGetAttr(self, name) + except AttributeError: + raise AttributeError( + "%s has no attribute %s" % (str(type(self)), name)) def __hash__(self): return _api_internal._raw_ptr(self) @@ -95,24 +84,6 @@ def same_as(self, other): return self.__hash__() == other.__hash__() -def register_node(type_key=None): - """register node type - - Parameters - ---------- - type_key : str or cls - The type key of the node - """ - node_name = type_key if isinstance(type_key, str) else type_key.__name__ - - def register(cls): - """internal register function""" - tindex = ctypes.c_int() - ret = _LIB.TVMNodeTypeKey2Index(c_str(node_name), ctypes.byref(tindex)) - if ret == 0: - _register_node(tindex.value, cls) - return cls - - if isinstance(type_key, str): - return register - return register(type_key) +# pylint: disable=invalid-name +register_node = register_object +_set_class_node(NodeBase) diff --git a/python/tvm/_ffi/object.py b/python/tvm/_ffi/object.py index be8b086a50f9..002fd27af0fd 100644 --- 
a/python/tvm/_ffi/object.py +++ b/python/tvm/_ffi/object.py @@ -20,25 +20,25 @@ import sys import ctypes -from .base import _FFI_MODE, check_call, _LIB, c_str +from .base import _FFI_MODE, _RUNTIME_ONLY, check_call, _LIB, c_str IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError try: - # pylint: disable=wrong-import-position + # pylint: disable=wrong-import-position,unused-import if _FFI_MODE == "ctypes": raise ImportError() if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_object + from ._cy3.core import _set_class_object, _set_class_node from ._cy3.core import ObjectBase as _ObjectBase from ._cy3.core import _register_object else: - from ._cy2.core import _set_class_object + from ._cy2.core import _set_class_object, _set_class_node from ._cy2.core import ObjectBase as _ObjectBase from ._cy2.core import _register_object except IMPORT_EXCEPT: - # pylint: disable=wrong-import-position - from ._ctypes.function import _set_class_object + # pylint: disable=wrong-import-position,unused-import + from ._ctypes.function import _set_class_object, _set_class_node from ._ctypes.object import ObjectBase as _ObjectBase from ._ctypes.object import _register_object @@ -75,8 +75,15 @@ def register(cls): tindex = cls._type_index else: tidx = ctypes.c_uint() - check_call(_LIB.TVMObjectTypeKey2Index( - c_str(object_name), ctypes.byref(tidx))) + if not _RUNTIME_ONLY: + check_call(_LIB.TVMObjectTypeKey2Index( + c_str(object_name), ctypes.byref(tidx))) + else: + # directly skip unknown objects during runtime. + ret = _LIB.TVMObjectTypeKey2Index( + c_str(object_name), ctypes.byref(tidx)) + if ret != 0: + return cls tindex = tidx.value _register_object(tindex, cls) return cls diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 00e19459df76..2dbb67dfbf73 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -36,13 +36,12 @@ class TypeCode(object): TVM_TYPE = 5 TVM_CONTEXT = 6 ARRAY_HANDLE = 7 - NODE_HANDLE = 8 + OBJECT_HANDLE = 8 MODULE_HANDLE = 9 FUNC_HANDLE = 10 STR = 11 BYTES = 12 NDARRAY_CONTAINER = 13 - OBJECT_HANDLE = 14 EXT_BEGIN = 15 diff --git a/python/tvm/error.py b/python/tvm/error.py index b5a7ed2374b7..a6d4f701d2a6 100644 --- a/python/tvm/error.py +++ b/python/tvm/error.py @@ -49,6 +49,7 @@ def __init__(self, msg): register_error("ValueError", ValueError) register_error("TypeError", TypeError) +register_error("AttributeError", AttributeError) @register_error diff --git a/python/tvm/relay/backend/profiler_vm.py b/python/tvm/relay/backend/profiler_vm.py index b36715249f0a..ded5d0d13bd7 100644 --- a/python/tvm/relay/backend/profiler_vm.py +++ b/python/tvm/relay/backend/profiler_vm.py @@ -62,6 +62,10 @@ def compile(mod, target=None, target_host=None, params=None): compiler._compile(mod, target, target_host) return vm.Executable(compiler._get_exec()) +def enabled(): + """Whether vm profiler is enabled.""" + return hasattr(_vm, "_VMCompilerProfiler") + class VMCompilerProfiler(vm.VMCompiler): """Build Relay module to run on VM runtime.""" def __init__(self): diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py index ee30f25d88c1..8887a7eb3c7c 100644 --- a/python/tvm/relay/debug.py +++ b/python/tvm/relay/debug.py @@ -17,12 +17,8 @@ # pylint: disable=wildcard-import, redefined-builtin, invalid-name """The Relay IR namespace containing the IR definition and compiler.""" from __future__ import absolute_import -from .base import NodeBase, register_relay_node from ..api import register_func 
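The new profiler_vm.enabled() above simply probes for an optional global. The same capability check from C++; the global name is inferred from the Python module prefix and should be treated as an assumption:

    #include <tvm/runtime/registry.h>

    bool VMProfilerEnabled() {
      return tvm::runtime::Registry::Get(
                 "relay._vm._VMCompilerProfiler") != nullptr;
    }
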
-@register_relay_node -class InterpreterState(NodeBase): - pass # pylint: disable=unused-argument def _debugger_init(expr, stack): diff --git a/rust/common/src/packed_func.rs b/rust/common/src/packed_func.rs index d9399492264b..848d5c00ab3f 100644 --- a/rust/common/src/packed_func.rs +++ b/rust/common/src/packed_func.rs @@ -71,7 +71,7 @@ macro_rules! TVMPODValue { Context(TVMContext), Handle(*mut c_void), ArrayHandle(TVMArrayHandle), - NodeHandle(*mut c_void), + ObjectHandle(*mut c_void), ModuleHandle(TVMModuleHandle), FuncHandle(TVMFunctionHandle), NDArrayContainer(*mut c_void), @@ -92,7 +92,7 @@ macro_rules! TVMPODValue { TVMTypeCode_kTVMContext => Context($value.v_ctx), TVMTypeCode_kHandle => Handle($value.v_handle), TVMTypeCode_kArrayHandle => ArrayHandle($value.v_handle as TVMArrayHandle), - TVMTypeCode_kNodeHandle => NodeHandle($value.v_handle), + TVMTypeCode_kObjectHandle => ObjectHandle($value.v_handle), TVMTypeCode_kModuleHandle => ModuleHandle($value.v_handle), TVMTypeCode_kFuncHandle => FuncHandle($value.v_handle), TVMTypeCode_kNDArrayContainer => NDArrayContainer($value.v_handle), @@ -124,7 +124,7 @@ macro_rules! TVMPODValue { TVMTypeCode_kArrayHandle, ) }, - NodeHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kNodeHandle), + ObjectHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kObjectHandle), ModuleHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kModuleHandle), FuncHandle(val) => ( diff --git a/rust/frontend/src/function.rs b/rust/frontend/src/function.rs index 948711276304..01d0c58cfc5d 100644 --- a/rust/frontend/src/function.rs +++ b/rust/frontend/src/function.rs @@ -264,7 +264,7 @@ unsafe extern "C" fn tvm_callback( for i in 0..len { value = args_list[i]; tcode = type_codes_list[i]; - if tcode == ffi::TVMTypeCode_kNodeHandle as c_int + if tcode == ffi::TVMTypeCode_kObjectHandle as c_int || tcode == ffi::TVMTypeCode_kFuncHandle as c_int || tcode == ffi::TVMTypeCode_kModuleHandle as c_int { diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index f31f02b1eaf4..c57e2afaa8eb 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -117,8 +117,7 @@ TVM_REGISTER_API("arith._CreateAnalyzer") }); } else if (name == "bind") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - auto& sptr = args[1].node_sptr(); - if (sptr->is_type()) { + if (args[1].IsObjectRef()) { self->Bind(args[0], args[1].operator Range()); } else { self->Bind(args[0], args[1].operator Expr()); diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 28ebb4d65005..c25c35f636e6 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -30,7 +30,7 @@ namespace tvm { TVM_REGISTER_API("_format_str") .set_body([](TVMArgs args, TVMRetValue *ret) { - CHECK(args[0].type_code() == kNodeHandle); + CHECK(args[0].type_code() == kObjectHandle); std::ostringstream os; os << args[0].operator NodeRef(); *ret = os.str(); @@ -38,9 +38,8 @@ TVM_REGISTER_API("_format_str") TVM_REGISTER_API("_raw_ptr") .set_body([](TVMArgs args, TVMRetValue *ret) { - CHECK(args[0].type_code() == kNodeHandle); - *ret = reinterpret_cast( - args[0].node_sptr().get()); + CHECK(args[0].type_code() == kObjectHandle); + *ret = reinterpret_cast(args[0].value().v_handle); }); TVM_REGISTER_API("_save_json") diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc index 73e26719cf15..f2ca67e6e2f9 100644 --- a/src/api/api_codegen.cc +++ b/src/api/api_codegen.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -33,7 +33,7 @@ namespace codegen { TVM_REGISTER_API("codegen._Build") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { *ret = Build({args[0]}, args[1]); } else { *ret = Build(args[0], args[1]); diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index b8ee1441fe12..9312c5532302 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -18,7 +18,6 @@ */ /*! 
- * Copyright (c) 2016 by Contributors * Implementation of API functions related to IR build * \file api_ir.cc */ diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index aa0ce47b4a37..f3d6c5f6ab62 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -57,25 +57,26 @@ TVM_REGISTER_API("_str") TVM_REGISTER_API("_Array") .set_body([](TVMArgs args, TVMRetValue* ret) { - std::vector > data; + std::vector data; for (int i = 0; i < args.size(); ++i) { if (args[i].type_code() != kNull) { - data.push_back(args[i].node_sptr()); + data.push_back(args[i].operator ObjectRef()); } else { - data.push_back(NodePtr(nullptr)); + data.push_back(ObjectRef(nullptr)); } } auto node = make_node(); node->data = std::move(data); - *ret = node; + *ret = runtime::ObjectRef(node); }); TVM_REGISTER_API("_ArrayGetItem") .set_body([](TVMArgs args, TVMRetValue* ret) { int64_t i = args[1]; - auto& sptr = args[0].node_sptr(); - CHECK(sptr->is_type()); - auto* n = static_cast(sptr.get()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + CHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); CHECK_LT(static_cast(i), n->data.size()) << "out of bound of array"; *ret = n->data[static_cast(i)]; @@ -83,10 +84,11 @@ TVM_REGISTER_API("_ArrayGetItem") TVM_REGISTER_API("_ArraySize") .set_body([](TVMArgs args, TVMRetValue* ret) { - auto& sptr = args[0].node_sptr(); - CHECK(sptr->is_type()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + CHECK(ptr->IsInstance()); *ret = static_cast( - static_cast(sptr.get())->data.size()); + static_cast(ptr)->data.size()); }); TVM_REGISTER_API("_Map") @@ -98,10 +100,10 @@ TVM_REGISTER_API("_Map") for (int i = 0; i < args.num_args; i += 2) { CHECK(args[i].type_code() == kStr) << "key of str map need to be str"; - CHECK(args[i + 1].type_code() == kNodeHandle) + CHECK(args[i + 1].type_code() == kObjectHandle) << "value of the map to be NodeRef"; data.emplace(std::make_pair(args[i].operator std::string(), - args[i + 1].node_sptr())); + args[i + 1].operator ObjectRef())); } auto node = make_node(); node->data = std::move(data); @@ -110,12 +112,12 @@ TVM_REGISTER_API("_Map") // Container node. 
      MapNode::ContainerType data;
       for (int i = 0; i < args.num_args; i += 2) {
-        CHECK(args[i].type_code() == kNodeHandle)
+        CHECK(args[i].type_code() == kObjectHandle)
             << "key of str map need to be str";
-        CHECK(args[i + 1].type_code() == kNodeHandle)
+        CHECK(args[i + 1].type_code() == kObjectHandle)
             << "value of map to be NodeRef";
-        data.emplace(std::make_pair(args[i].node_sptr(),
-                                    args[i + 1].node_sptr()));
+        data.emplace(std::make_pair(args[i].operator ObjectRef(),
+                                    args[i + 1].operator ObjectRef()));
       }
       auto node = make_node<MapNode>();
       node->data = std::move(data);
@@ -125,31 +127,33 @@ TVM_REGISTER_API("_Map")
 
 TVM_REGISTER_API("_MapSize")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    auto& sptr = args[0].node_sptr();
-    if (sptr->is_type<MapNode>()) {
-      auto* n = static_cast<const MapNode*>(sptr.get());
+    CHECK_EQ(args[0].type_code(), kObjectHandle);
+    Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+    if (ptr->IsInstance<MapNode>()) {
+      auto* n = static_cast<const MapNode*>(ptr);
       *ret = static_cast<int64_t>(n->data.size());
     } else {
-      CHECK(sptr->is_type<StrMapNode>());
-      auto* n = static_cast<const StrMapNode*>(sptr.get());
+      CHECK(ptr->IsInstance<StrMapNode>());
+      auto* n = static_cast<const StrMapNode*>(ptr);
       *ret = static_cast<int64_t>(n->data.size());
     }
  });

 TVM_REGISTER_API("_MapGetItem")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    CHECK(args[0].type_code() == kNodeHandle);
-    auto& sptr = args[0].node_sptr();
-    if (sptr->is_type<MapNode>()) {
-      CHECK(args[1].type_code() == kNodeHandle);
-      auto* n = static_cast<const MapNode*>(sptr.get());
-      auto it = n->data.find(args[1].node_sptr());
+    CHECK_EQ(args[0].type_code(), kObjectHandle);
+    Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+
+    if (ptr->IsInstance<MapNode>()) {
+      CHECK(args[1].type_code() == kObjectHandle);
+      auto* n = static_cast<const MapNode*>(ptr);
+      auto it = n->data.find(args[1].operator ObjectRef());
       CHECK(it != n->data.end())
          << "cannot find the corresponding key in the Map";
      *ret = (*it).second;
     } else {
-      CHECK(sptr->is_type<StrMapNode>());
-      auto* n = static_cast<const StrMapNode*>(sptr.get());
+      CHECK(ptr->IsInstance<StrMapNode>());
+      auto* n = static_cast<const StrMapNode*>(ptr);
       auto it = n->data.find(args[1].operator std::string());
       CHECK(it != n->data.end())
           << "cannot find the corresponding key in the Map";
@@ -159,16 +163,17 @@ TVM_REGISTER_API("_MapGetItem")
 
 TVM_REGISTER_API("_MapCount")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    CHECK(args[0].type_code() == kNodeHandle);
-    auto& sptr = args[0].node_sptr();
-    if (sptr->is_type<MapNode>()) {
-      auto* n = static_cast<const MapNode*>(sptr.get());
-      CHECK(args[1].type_code() == kNodeHandle);
+    CHECK_EQ(args[0].type_code(), kObjectHandle);
+    Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+
+    if (ptr->IsInstance<MapNode>()) {
+      auto* n = static_cast<const MapNode*>(ptr);
+      CHECK_EQ(args[0].type_code(), kObjectHandle);
       *ret = static_cast<int64_t>(
-          n->data.count(args[1].node_sptr()));
+          n->data.count(args[1].operator ObjectRef()));
     } else {
-      CHECK(sptr->is_type<StrMapNode>());
-      auto* n = static_cast<const StrMapNode*>(sptr.get());
+      CHECK(ptr->IsInstance<StrMapNode>());
+      auto* n = static_cast<const StrMapNode*>(ptr);
       *ret = static_cast<int64_t>(
           n->data.count(args[1].operator std::string()));
     }
@@ -176,9 +181,11 @@ TVM_REGISTER_API("_MapCount")
 
 TVM_REGISTER_API("_MapItems")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    auto& sptr = args[0].node_sptr();
-    if (sptr->is_type<MapNode>()) {
-      auto* n = static_cast<const MapNode*>(sptr.get());
+    CHECK_EQ(args[0].type_code(), kObjectHandle);
+    Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+
+    if (ptr->IsInstance<MapNode>()) {
+      auto* n = static_cast<const MapNode*>(ptr);
       auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
         rkvs->data.push_back(kv.first);
@@ -186,10 +193,10 @@ TVM_REGISTER_API("_MapItems")
       }
       *ret = rkvs;
     } else {
-      auto* n = static_cast<const StrMapNode*>(sptr.get());
+      auto* n = static_cast<const StrMapNode*>(ptr);
      auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
-        rkvs->data.push_back(ir::StringImm::make(kv.first).node_);
+        rkvs->data.push_back(ir::StringImm::make(kv.first));
         rkvs->data.push_back(kv.second);
       }
       *ret = rkvs;
@@ -426,7 +433,7 @@ TVM_REGISTER_API("_ScheduleCacheRead")
 
 TVM_REGISTER_API("_ScheduleCacheWrite")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    if (args[1].IsNodeType<Tensor>()) {
+    if (args[1].IsObjectRef<Tensor>()) {
       *ret = args[0].operator Schedule()
           .cache_write(args[1].operator Tensor(), args[2]);
     } else {
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index d2352496c2b4..dd0415afd9eb 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -35,7 +35,7 @@ namespace ir {
 
 TVM_REGISTER_API("ir_pass.Simplify")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    if (args[0].IsNodeType<Stmt>()) {
+    if (args[0].IsObjectRef<Stmt>()) {
       if (args.size() > 1) {
         *ret = Simplify(args[0].operator Stmt(), args[1]);
       } else {
@@ -52,7 +52,7 @@ TVM_REGISTER_API("ir_pass.Simplify")
 
 TVM_REGISTER_API("ir_pass.CanonicalSimplify")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    if (args[0].IsNodeType<Stmt>()) {
+    if (args[0].IsObjectRef<Stmt>()) {
       if (args.size() > 1) {
         *ret = CanonicalSimplify(args[0].operator Stmt(), args[1]);
       } else {
@@ -69,7 +69,7 @@ TVM_REGISTER_API("ir_pass.CanonicalSimplify")
 
 TVM_REGISTER_API("ir_pass.Substitute")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    if (args[0].IsNodeType<Stmt>()) {
+    if (args[0].IsObjectRef<Stmt>()) {
       *ret = Substitute(args[0].operator Stmt(),
                         args[1].operator Map<Var, Expr>());
     } else {
       *ret = Substitute(args[0].operator Expr(),
                         args[1].operator Map<Var, Expr>());
@@ -78,7 +78,7 @@ TVM_REGISTER_API("ir_pass.Substitute")
 
 TVM_REGISTER_API("ir_pass.Equal")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    if (args[0].IsNodeType<Stmt>()) {
+    if (args[0].IsObjectRef<Stmt>()) {
       *ret = Equal(args[0].operator Stmt(), args[1].operator Stmt());
     } else {
       *ret = Equal(args[0].operator Expr(), args[1].operator Expr());
diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc
index 177360bf2ebb..cf0e0f3c6b7a 100644
--- a/src/api/api_schedule.cc
+++ b/src/api/api_schedule.cc
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -18,7 +18,6 @@
  */
 
 /*!
- * Copyright (c) 2017 by Contributors
  * Implementation of API functions related to schedule pass.
  * \file api_schedule.cc
  */
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
index 89e999f73edb..64805c9e8aa0 100644
--- a/src/api/dsl_api.cc
+++ b/src/api/dsl_api.cc
@@ -18,36 +18,18 @@
  */
 
 /*!
- * Copyright (c) 2016 by Contributors
  * Implementation of DSL API
  * \file dsl_api.cc
  */
-#include
 #include
-#include
 #include
 #include
+#include
 #include
 #include
-#include
-#include "../runtime/dsl_api.h"
 
 namespace tvm {
 namespace runtime {
-/*! \brief entry to to easily hold returning information */
-struct TVMAPIThreadLocalEntry {
-  /*! \brief result holder for returning strings */
-  std::vector<std::string> ret_vec_str;
-  /*! \brief result holder for returning string pointers */
-  std::vector<const char *> ret_vec_charp;
-  /*! \brief result holder for retruning string */
-  std::string ret_str;
-};
-
-/*! \brief Thread local store that can be used to hold return values. */
-typedef dmlc::ThreadLocalStore<TVMAPIThreadLocalEntry> TVMAPIThreadLocalStore;
-
-using TVMAPINode = NodePtr<Node>;
 
 struct APIAttrGetter : public AttrVisitor {
   std::string skey;
@@ -138,93 +120,71 @@ struct APIAttrDir : public AttrVisitor {
   }
 };
 
-class DSLAPIImpl : public DSLAPI {
- public:
-  void NodeFree(NodeHandle handle) const final {
-    delete static_cast<TVMAPINode*>(handle);
-  }
-  void NodeTypeKey2Index(const char* type_key,
-                         int* out_index) const final {
-    *out_index = static_cast<int>(Node::TypeKey2Index(type_key));
-  }
-  void NodeGetTypeIndex(NodeHandle handle,
-                        int* out_index) const final {
-    *out_index = static_cast<int>(
-        (*static_cast<TVMAPINode*>(handle))->type_index());
-  }
-  void NodeGetAttr(NodeHandle handle,
-                   const char* key,
-                   TVMValue* ret_val,
-                   int* ret_type_code,
-                   int* ret_success) const final {
-    TVMRetValue rv;
+struct NodeAPI {
+  static void GetAttr(TVMArgs args, TVMRetValue* ret) {
+    NodeRef ref = args[0];
+    Node* tnode = const_cast<Node*>(ref.get());
     APIAttrGetter getter;
-    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
-    getter.skey = key;
-    getter.ret = &rv;
+    getter.skey = args[1].operator std::string();
+    getter.ret = ret;
+
+    bool success;
     if (getter.skey == "type_key") {
-      ret_val->v_str = (*tnode)->type_key();
-      *ret_type_code = kStr;
-      *ret_success = 1;
-      return;
-    } else if (!(*tnode)->is_type<DictAttrsNode>()) {
-      (*tnode)->VisitAttrs(&getter);
-      *ret_success = getter.found_ref_object || rv.type_code() != kNull;
+      *ret = tnode->GetTypeKey();
+      success = true;
+    } else if (!tnode->IsInstance<DictAttrsNode>()) {
+      tnode->VisitAttrs(&getter);
+      success = getter.found_ref_object || ret->type_code() != kNull;
     } else {
       // specially handle dict attr
-      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
-      auto it = dnode->dict.find(key);
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode);
+      auto it = dnode->dict.find(getter.skey);
       if (it != dnode->dict.end()) {
-        *ret_success = 1;
-        rv = (*it).second;
+        success = true;
+        *ret = (*it).second;
       } else {
-        *ret_success = 0;
+        success = false;
       }
     }
-    if (*ret_success) {
-      if (rv.type_code() == kStr ||
-          rv.type_code() == kTVMType) {
-        TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get();
-        e->ret_str = rv.operator std::string();
-        *ret_type_code = kStr;
-        ret_val->v_str = e->ret_str.c_str();
-      } else {
-        rv.MoveToCHost(ret_val, ret_type_code);
-      }
+    if (!success) {
+      LOG(FATAL) << "AttributeError: " << tnode->GetTypeKey()
+                 << " object has no attributed " << getter.skey;
     }
   }
-  void NodeListAttrNames(NodeHandle handle,
-                         int *out_size,
-                         const char*** out_array) const final {
-    TVMAPIThreadLocalEntry *ret = TVMAPIThreadLocalStore::Get();
-    ret->ret_vec_str.clear();
-    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
+
+  static void ListAttrNames(TVMArgs args, TVMRetValue* ret) {
+    NodeRef ref = args[0];
+    Node* tnode = const_cast<Node*>(ref.get());
+    auto names = std::make_shared<std::vector<std::string> >();
     APIAttrDir dir;
-    dir.names = &(ret->ret_vec_str);
+    dir.names = names.get();
 
-    if (!(*tnode)->is_type<DictAttrsNode>()) {
-      (*tnode)->VisitAttrs(&dir);
+    if (!tnode->IsInstance<DictAttrsNode>()) {
+      tnode->VisitAttrs(&dir);
     } else {
       // specially handle dict attr
-      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode);
       for (const auto& kv : dnode->dict) {
-        ret->ret_vec_str.push_back(kv.first);
+        names->push_back(kv.first);
       }
     }
-    ret->ret_vec_charp.clear();
-    for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
-      ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
-    }
-    *out_array = dmlc::BeginPtr(ret->ret_vec_charp);
-    *out_size = static_cast<int>(ret->ret_vec_str.size());
+
+    *ret = PackedFunc([names](TVMArgs args, TVMRetValue *rv) {
+      int64_t i = args[0];
+      if (i == -1) {
+        *rv = static_cast<int64_t>(names->size());
+      } else {
+        *rv = (*names)[i];
+      }
+    });
   }
 };
 
-TVM_REGISTER_GLOBAL("dsl_api.singleton")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    static DSLAPIImpl impl;
-    void* ptr = &impl;
-    *rv = ptr;
-  });
+TVM_REGISTER_GLOBAL("_NodeGetAttr")
+.set_body(NodeAPI::GetAttr);
+
+TVM_REGISTER_GLOBAL("_NodeListAttrNames")
+.set_body(NodeAPI::ListAttrNames);
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc
index acd964935c25..98e25742592d 100644
--- a/src/arithmetic/analyzer.cc
+++ b/src/arithmetic/analyzer.cc
@@ -36,9 +36,7 @@ Analyzer::Analyzer()
       int_set(this) {
 }
 
-void Analyzer::Bind(const VarExpr& v, const Expr& expr) {
-  Var var(v.node_);
-
+void Analyzer::Bind(const VarExpr& var, const Expr& expr) {
   Expr new_expr = expr;
   new_expr = this->canonical_simplify(new_expr);
   new_expr = this->rewrite_simplify(new_expr);
@@ -49,9 +47,8 @@ void Analyzer::Bind(const VarExpr& v, const Expr& expr) {
   this->canonical_simplify.Update(var, new_expr);
 }
 
-void Analyzer::Bind(const VarExpr& v, const Range& range) {
+void Analyzer::Bind(const VarExpr& var, const Range& range) {
   CHECK(range.defined());
-  Var var(v.node_);
   if (is_one(range->extent)) {
     this->Bind(var, range->min);
   } else {
diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc
index d80e4969d5c2..02e8079c9c7b 100644
--- a/src/arithmetic/canonical_simplify.cc
+++ b/src/arithmetic/canonical_simplify.cc
@@ -629,7 +629,7 @@ Mutate_(const Mul* op, const Expr& self) {
   }
   if (const auto* bconst = b.as<IntImm>()) {
     if (a.as<SumExprNode>()) {
-      SumExpr ret(std::move(a.node_));
+      SumExpr ret = Downcast<SumExpr>(std::move(a));
       ret.CopyOnWrite()->MulToSelf(bconst->value);
       return std::move(ret);
     } else {
@@ -931,7 +931,7 @@ Mutate_(const Mod* op, const Expr& self) {
         int64_t new_base = psum->base % cval;
         if (cbound->min_value >= 0 &&
             cbound->min_value - psum->base + new_base >= 0) {
-          SumExpr sum_expr(std::move(a.node_));
+          SumExpr sum_expr = Downcast<SumExpr>(a);
           sum_expr.CopyOnWrite()->base = new_base;
           return SplitModConst(ToSplitExpr(std::move(sum_expr)), cval, kTruncDiv);
         }
@@ -992,7 +992,7 @@ Mutate_(const FloorMod* op, const Expr& self) {
         // Simplify the offset constant if necessary.
        // floormod(x - 5, 3) => floormod(x + 1, 3)
         int64_t new_base = floormod(psum->base, cval);
-        SumExpr sum_expr(std::move(a.node_));
+        SumExpr sum_expr = Downcast<SumExpr>(std::move(a));
         sum_expr.CopyOnWrite()->base = new_base;
         return SplitModConst(ToSplitExpr(std::move(sum_expr)), cval, kFloorDiv);
       } else {
diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc
index d5c012d302dc..168486ee0018 100644
--- a/src/arithmetic/const_int_bound.cc
+++ b/src/arithmetic/const_int_bound.cc
@@ -39,7 +39,7 @@ ConstIntBound::ConstIntBound(
   auto node = make_node<ConstIntBoundNode>();
   node->min_value = min_value;
   node->max_value = max_value;
-  node_ = std::move(node);
+  data_ = std::move(node);
 }
 
 inline void PrintBoundValue(std::ostream& os, int64_t val) {
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 3c5f12a7379e..7da020efc42a 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -176,7 +176,7 @@ bool DetectClipBound(
   if (const Variable* v = n.as<Variable>()) {
     if (bmap->count(v)) {
       if (flag == 0) {
-        var = Var(n.node_);
+        var = Downcast<Var>(n);
         flag = 1;
       } else if (flag == 1) {
         if (!var.same_as(n)) {
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index 0e24714daf1f..313b34ded034 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -40,7 +40,7 @@ IntervalSet::IntervalSet(Expr min_value, Expr max_value) {
   auto node = make_node<IntervalSetNode>();
   node->min_value = std::move(min_value);
   node->max_value = std::move(max_value);
-  node_ = std::move(node);
+  data_ = std::move(node);
 }
 
 IntervalSet MakeIntervalSet(Expr min_value, Expr max_value) {
@@ -506,7 +506,7 @@ class IntervalSetEvaluator :
   }
 
   IntervalSet VisitExprDefault_(const Node* op) final {
-    DLOG(WARNING) << "cannot evaluate set type " << op->type_key();
+    DLOG(WARNING) << "cannot evaluate set type " << op->GetTypeKey();
     return IntervalSet::Everything();
   }
 
diff --git a/src/arithmetic/ir_mutator_with_analyzer.cc b/src/arithmetic/ir_mutator_with_analyzer.cc
index 04e166ae52c0..cda9d585ace1 100644
--- a/src/arithmetic/ir_mutator_with_analyzer.cc
+++ b/src/arithmetic/ir_mutator_with_analyzer.cc
@@ -87,7 +87,7 @@ Stmt IRMutatorWithAnalyzer::
 Mutate_(const AttrStmt* op, const Stmt& s) {
   if (op->attr_key == attr::thread_extent ||
       op->attr_key == attr::virtual_thread) {
-    IterVar iv(op->node.node_);
+    IterVar iv = Downcast<IterVar>(op->node);
     CHECK_NE(iv->thread_tag.length(), 0U);
     analyzer_->Bind(iv->var,
                     Range::make_by_min_extent(0, op->value));
diff --git a/src/arithmetic/ir_visitor_with_analyzer.h b/src/arithmetic/ir_visitor_with_analyzer.h
index 71eea50e4c72..918f2e89501f 100644
--- a/src/arithmetic/ir_visitor_with_analyzer.h
+++ b/src/arithmetic/ir_visitor_with_analyzer.h
@@ -47,7 +47,7 @@ class IRVisitorWithAnalyzer final : public IRVisitor {
   void Visit_(const AttrStmt* op) {
     if (op->attr_key == attr::thread_extent ||
         op->attr_key == attr::virtual_thread) {
-      IterVar iv(op->node.node_);
+      IterVar iv = Downcast<IterVar>(op->node);
       CHECK_NE(iv->thread_tag.length(), 0U);
       analyzer_.Bind(iv->var,
                      Range::make_by_min_extent(0, op->value));
diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc
index 08454dd0ef5a..9e363e7cf99a 100644
--- a/src/arithmetic/modular_set.cc
+++ b/src/arithmetic/modular_set.cc
@@ -41,7 +41,7 @@ ModularSet::ModularSet(int64_t coeff, int64_t base) {
   node->coeff = coeff;
   node->base = base;
   // finish construction.
-  node_ = std::move(node);
+  data_ = std::move(node);
 }
 
 TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 3f1c32243a23..66340e9c9021 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -34,6 +34,7 @@
 namespace tvm {
 
 TVM_REGISTER_NODE_TYPE(TargetNode);
+TVM_REGISTER_NODE_TYPE(GenericFuncNode);
 
 TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 .set_dispatch<TargetNode>([](const TargetNode *op, IRPrinter *p) {
@@ -51,9 +52,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
   */
 Target CreateTarget(const std::string& target_name,
                     const std::vector<std::string>& options) {
-  auto target = Target(make_node<TargetNode>());
-  auto t = static_cast<TargetNode*>(target.node_.get());
-
+  auto t = make_node<TargetNode>();
   t->target_name = target_name;
 
   std::string libs_flag = "-libs=";
@@ -137,7 +136,7 @@ Target CreateTarget(const std::string& target_name,
     return target::stackvm();
   }
 
-  return target;
+  return Target(t);
 }
 
 TVM_REGISTER_API("_TargetCreate")
@@ -674,7 +673,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
   });
 
 struct GenericFunc::Manager {
-  std::unordered_map<std::string, NodePtr<Node> > fmap;
+  std::unordered_map<std::string, GenericFunc> fmap;
   // mutex
   std::mutex mutex;
 
@@ -694,10 +693,11 @@ GenericFunc GenericFunc::Get(const std::string& name) {
   if (it == m->fmap.end()) {
     auto f = make_node<GenericFuncNode>();
     f->name_ = name;
-    m->fmap[name] = f;
-    return GenericFunc(f);
+    auto gf = GenericFunc(f);
+    m->fmap[name] = gf;
+    return gf;
   } else {
-    return GenericFunc(it->second);
+    return it->second;
   }
 }
 
@@ -707,12 +707,12 @@ void GenericFunc::RegisterGenericFunc(GenericFunc func,
                                       const std::string& name) {
   auto it = m->fmap.find(name);
   CHECK(it == m->fmap.end()) << "GenericFunc already registered " << name;
   func->name_ = name;
-  m->fmap[name] = func.node_;
+  m->fmap[name] = func;
 }
 
 GenericFunc& GenericFunc::set_default(const PackedFunc value,
-                                           bool allow_override) {
-  auto node = static_cast<GenericFuncNode*>(node_.get());
+                                      bool allow_override) {
+  auto node = static_cast<GenericFuncNode*>(operator->());
   if (!allow_override) {
     CHECK(node->generic_func_ == nullptr)
       << "Generic function already registered for " << node->name_;
@@ -736,7 +736,7 @@ GenericFunc& GenericFunc::register_func(const std::vector<std::string>& tags,
 }
 
 void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const {
-  auto node = static_cast<GenericFuncNode*>(node_.get());
+  auto node = static_cast<const GenericFuncNode*>(get());
 
   auto target = Target::Current(true);
   PackedFunc func;
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index ecf62ab0cfac..ab203f2aa28a 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -806,7 +806,7 @@ void CodeGenC::VisitStmt_(const Allocate* op) {
 
 void CodeGenC::VisitStmt_(const AttrStmt* op) {
   if (op->attr_key == ir::attr::thread_extent) {
-    IterVar iv(op->node.node_);
+    IterVar iv = Downcast<IterVar>(op->node);
     if (iv->thread_tag.length() != 0) {
       if (!var_idmap_.count(iv->var.get())) {
         BindThreadIndex(iv);
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index d009290bb2fe..de54e242ff40 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -1173,7 +1173,7 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) {
 
 void CodeGenLLVM::VisitStmt_(const AttrStmt* op) {
   if (op->attr_key == attr::thread_extent) {
-    IterVar iv(op->node.node_);
+    IterVar iv = Downcast<IterVar>(op->node);
     if (iv->thread_tag.length() != 0) {
       if (!var_map_.count(iv->var.get())) {
         var_map_[iv->var.get()] = GetThreadIndex(iv);
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc
index 7caf3a258b6f..6a3b0571c9ab 100644
--- a/src/codegen/spirv/codegen_spirv.cc
+++ b/src/codegen/spirv/codegen_spirv.cc
@@ -606,7 +606,7 @@ void CodeGenSPIRV::VisitStmt_(const Allocate* op) {
 
 void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) {
   if (op->attr_key == attr::thread_extent) {
-    IterVar iv(op->node.node_);
+    IterVar iv = Downcast<IterVar>(op->node);
     if (iv->thread_tag.length() != 0) {
       if (!var_map_.count(iv->var.get())) {
         var_map_[iv->var.get()] = GetThreadIndex(iv, op->value);
diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc
index 54616adc214e..778b6b1a7811 100644
--- a/src/contrib/hybrid/codegen_hybrid.cc
+++ b/src/contrib/hybrid/codegen_hybrid.cc
@@ -300,7 +300,7 @@ void CodeGenHybrid::VisitStmt_(const AttrStmt* op) {
     PrintStmt(op->body);
     indent_ -= tab_;
   } else if (op->attr_key == ir::attr::realize_scope) {
-    auto v = FunctionRef(op->node.node_);
+    auto v = Downcast<FunctionRef>(op->node);
     alloc_storage_scope_[v] = op->value.as<StringImm>()->value;
     PrintStmt(op->body);
   } else {
@@ -408,7 +408,7 @@ void CodeGenHybrid::PrintIndent() {
 
 std::string CodeGenHybrid::GetVarID(const Variable *v) {
   if (binds_.count(v)) return binds_[v];
-  auto key = std::make_pair(v->GetNodePtr().get(), 0);
+  auto key = std::make_pair(static_cast<const Node*>(v), 0);
   if (id_map_.count(key)) {
     return id_map_[key];
   }
diff --git a/src/contrib/hybrid/codegen_hybrid.h b/src/contrib/hybrid/codegen_hybrid.h
index 498838fc908f..866756996f8d 100644
--- a/src/contrib/hybrid/codegen_hybrid.h
+++ b/src/contrib/hybrid/codegen_hybrid.h
@@ -18,7 +18,6 @@
  */
 
 /*!
- * Copyright (c) 2019 by Contributors
  * \file codegen_hybrid.h
  * \brief Common utilities to generated C style code.
  */
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index 995dfb392e87..b9391e4895b9 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -44,17 +44,17 @@ class AttrFunctor;
 
 #define ATTR_FUNCTOR_DISPATCH(OP)                                       \
   vtable.template set_dispatch<OP>(                                     \
-      [](const NodeRef& n, TSelf* self, Args... args) {                 \
-        return self->VisitAttr_(static_cast<const OP*>(n.node_.get()),  \
+      [](const ObjectRef& n, TSelf* self, Args... args) {               \
+        return self->VisitAttr_(static_cast<const OP*>(n.get()),        \
                                 std::forward<Args>(args)...);           \
       });                                                               \
 
 // A functor for common attribute information.
 template <typename R, typename... Args>
-class AttrFunctor<R(const NodeRef& n, Args...)> {
+class AttrFunctor<R(const ObjectRef& n, Args...)> {
  private:
-  using TSelf = AttrFunctor<R(const NodeRef& n, Args...)>;
-  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+  using TSelf = AttrFunctor<R(const ObjectRef& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const ObjectRef& n, TSelf* self, Args...)>;
 
  public:
  /*! \brief the result type of this functor */
@@ -65,7 +65,7 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
    * \param args Additional arguments.
    * \return The result of the call
    */
-  virtual R VisitAttr(const NodeRef& n, Args... args) {
+  virtual R VisitAttr(const ObjectRef& n, Args... args) {
     static FType vtable = InitVTable();
     if (vtable.can_dispatch(n)) {
       return vtable(n, this, std::forward<Args>(args)...);
@@ -73,7 +73,7 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
       return VisitAttrDefault_(n.get(), std::forward<Args>(args)...);
     }
   }
-  virtual R VisitAttrDefault_(const Node* node, Args... args) = 0;
+  virtual R VisitAttrDefault_(const Object* node, Args... args) = 0;
   virtual R VisitAttr_(const ArrayNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const StrMapNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::IntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
@@ -143,60 +143,60 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
 };
 
 class AttrsEqualHandler :
-    protected AttrFunctor<bool(const NodeRef&, const NodeRef&)> {
+    protected AttrFunctor<bool(const ObjectRef&, const ObjectRef&)> {
  public:
  /*!
   * \brief Check if lhs equals rhs
   * \param lhs The left operand.
   * \param rhs The right operand.
  */
-  bool Equal(const NodeRef& lhs, const NodeRef& rhs);
+  bool Equal(const ObjectRef& lhs, const ObjectRef& rhs);
 
 protected:
-  bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::IntImm* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::UIntImm* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::FloatImm* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::StringImm* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Add* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Sub* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Mul* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Div* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Mod* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::FloorDiv* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::FloorMod* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Min* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Max* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::GE* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::GT* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::LT* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::LE* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::EQ* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::NE* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::And* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Or* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Not* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Cast* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Call* lhs, const NodeRef& other) final;
-  bool VisitAttr_(const ir::Select* lhs, const NodeRef& other) final;
+  bool VisitAttrDefault_(const Object* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ArrayNode* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const StrMapNode* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::IntImm* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::UIntImm* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::FloatImm* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::StringImm* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Add* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Sub* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Mul* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Div* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Mod* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::FloorDiv* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::FloorMod* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Min* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Max* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::GE* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::GT* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::LT* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::LE* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::EQ* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::NE* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::And* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Or* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Not* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Cast* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Call* lhs, const ObjectRef& other) final;
+  bool VisitAttr_(const ir::Select* lhs, const ObjectRef& other) final;
 };
 
 class AttrsHashHandler :
-    protected AttrFunctor<size_t(const NodeRef&)> {
+    protected AttrFunctor<size_t(const ObjectRef&)> {
  public:
   /*!
    * \brief Get hash value of node
    * \param node The node to be hashed.
    */
-  size_t Hash(const NodeRef& node) {
+  size_t Hash(const ObjectRef& node) {
     if (!node.defined()) return 0;
     return this->VisitAttr(node);
   }
 
 protected:
-  size_t VisitAttrDefault_(const Node* lhs) final;
+  size_t VisitAttrDefault_(const Object* lhs) final;
   size_t VisitAttr_(const ir::IntImm* lhs) final;
   size_t VisitAttr_(const ir::UIntImm* lhs) final;
   size_t VisitAttr_(const ir::FloatImm* lhs) final;
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index c5b14ac577ec..a299e17996e0 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -40,7 +40,7 @@ void DictAttrsNode::InitByPackedArgs(
   for (int i = 0; i < args.size(); i += 2) {
     std::string key = args[i];
     runtime::TVMArgValue val = args[i + 1];
-    if (val.type_code() == kNodeHandle) {
+    if (val.type_code() == kObjectHandle) {
       dict.Set(key, val.operator NodeRef());
     } else if (val.type_code() == kStr) {
       dict.Set(key, Expr(val.operator std::string()));
@@ -72,14 +72,14 @@ TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
 
 using namespace ir;
 
 // Equal handler.
-bool AttrsEqualHandler::Equal(const NodeRef& lhs, const NodeRef& rhs) {
+bool AttrsEqualHandler::Equal(const ObjectRef& lhs, const ObjectRef& rhs) {
   if (lhs.same_as(rhs)) return true;
   if (!lhs.defined() || !rhs.defined()) return false;
   return this->VisitAttr(lhs, rhs);
 }
 
-bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other) {
-  if (lhs->derived_from<BaseAttrsNode>()) {
+bool AttrsEqualHandler::VisitAttrDefault_(const Object* lhs, const ObjectRef& other) {
+  if (lhs->IsInstance<BaseAttrsNode>()) {
     AttrsEqual equal;
     equal.handler_ = this;
     return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(
@@ -88,58 +88,58 @@ bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other)
   return lhs == other.get();
 }
 
-bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<IntImm>()) {
     return lhs->value == rhs->value;
   }
   return false;
 }
 
-bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<UIntImm>()) {
     return lhs->value == rhs->value;
   }
   return false;
 }
 
-bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<FloatImm>()) {
     return lhs->value == rhs->value;
   }
   return false;
 }
 
-bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<StringImm>()) {
     return lhs->value == rhs->value;
   }
   return false;
 }
 
-bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<ArrayNode>()) {
     if (rhs->data.size() != lhs->data.size()) return false;
    for (size_t i = 0; i < lhs->data.size(); ++i) {
-      if (!Equal(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false;
+      if (!Equal(lhs->data[i], rhs->data[i])) return false;
     }
   }
   return true;
 }
 
-bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<StrMapNode>()) {
     if (rhs->data.size() != lhs->data.size()) return false;
     for (const auto& kv : lhs->data) {
       auto it = rhs->data.find(kv.first);
       if (it == rhs->data.end()) return false;
-      if (!Equal(NodeRef(kv.second), NodeRef(it->second))) return false;
+      if (!Equal(kv.second, it->second)) return false;
     }
   }
   return true;
 }
 
 #define TVM_DEFINE_ATTRS_BINOP_EQUAL(NodeName)                          \
-  bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const NodeRef& other) { \
+  bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const ObjectRef& other) { \
     if (const auto* rhs = other.as<NodeName>()) {                       \
       if (!Equal(lhs->a, rhs->a)) return false;                        \
       if (!Equal(lhs->b, rhs->b)) return false;                        \
@@ -167,7 +167,7 @@ TVM_DEFINE_ATTRS_BINOP_EQUAL(NE);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(And);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Or);
 
-bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<Not>()) {
     return Equal(lhs->a, rhs->a);
   } else {
@@ -175,7 +175,7 @@ bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) {
   }
 }
 
-bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<Cast>()) {
     if (lhs->type != rhs->type) return false;
     return Equal(lhs->value, rhs->value);
@@ -184,7 +184,7 @@ bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
   }
 }
 
-bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as<Call>()) {
     return
         lhs->name == rhs->name &&
@@ -196,7 +196,7 @@ bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) {
   }
 }
 
-bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const ObjectRef& other) {
   if (const auto* rhs = other.as