diff --git a/CODEOWNERS b/.github/CODEOWNERS
similarity index 51%
rename from CODEOWNERS
rename to .github/CODEOWNERS
index 0e22cff91e0b..5d0e94533bf4 100644
--- a/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,13 +5,23 @@
* @dmlc/tvm-committers
# LLVM backends
-src/llvm/* @aatluri
+src/codegen/llvm/* @aatluri
# ROCM runtime
src/runtime/rocm/* @aatluri
+# SGX support
+src/runtime/sgx/* @nhynes
+apps/sgx/* @nhynes
+
# JVM language
-jvm/* @javelinjs
+jvm/* @yzhliu
+
+# WebGL backends
+src/runtime/opengl/* @phisiart
+src/codegen/*opengl* @phisiart
# TOPI
topi/python/topi/* @Laurawly @Huyuwei
+
+
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 000000000000..0e2a130d489e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,7 @@
+Thanks for participating in the TVM community! We use https://discuss.tvm.ai for general usage questions and discussions. The issue tracker is used for actionable items such as feature proposal discussions, roadmaps, and bug tracking. You are always welcome to post on the forum first :)
+
+Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that fall to the bottom of the pile. Feel free to open a new issue if an additional problem needs attention after an old one gets closed.
+
+For bug reports, to help developers act on the issue, please include a description of your environment and, preferably, a minimal script to reproduce the problem.
+
+For feature proposals, list clear, small actionable items so we can track the progress of the change.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 000000000000..313b776b0824
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1 @@
+Thanks for contributing to TVM! Please refer to the guidelines at https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from others in the community.
diff --git a/.gitignore b/.gitignore
index f59a58552f8d..3c968eb3ed47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -98,7 +98,6 @@ build_*
Win32
*.dir
perf
-nnvm
*.wasm
.emscripten
@@ -132,13 +131,63 @@ xcuserdata/
.emscripten*
.m2
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
## Other
*.moved-aside
*.xccheckout
*.xcscmblueprint
.DS_Store
tags
+cscope*
+*.lock
# vim temporary files
*.swp
*.swo
+
+# TVM generated code
+perf
+.bash_history
+*.json
+*.params
+*.onnx
+*.h5
+synset.txt
+cat.jpg
+docs.tgz
+cat.png
+*.mlmodel
+# Mac OS X
+.DS_Store
+build*
+
+# Jetbrain
+.idea
+
+# tmp file
+.nfs*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f42705ae7fda..39776d53d1f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,12 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.2)
project(tvm C CXX)
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
- include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
-endif()
-
-include(cmake/Util.cmake)
+# Utility functions
+include(cmake/util/Util.cmake)
+include(cmake/util/FindCUDA.cmake)
+include(cmake/util/FindVulkan.cmake)
+include(cmake/util/FindLLVM.cmake)
+include(cmake/util/FindROCM.cmake)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
@@ -19,33 +20,47 @@ endif()
# You can create a config.cmake at build folder
# and add set(OPTION VALUE) to override these build options.
 # Alternatively, use cmake -DOPTION=VALUE through the command line.
-
tvm_option(USE_CUDA "Build with CUDA" OFF)
tvm_option(USE_OPENCL "Build with OpenCL" OFF)
+tvm_option(USE_VULKAN "Build with Vulkan" OFF)
+tvm_option(USE_OPENGL "Build with OpenGL" OFF)
tvm_option(USE_METAL "Build with Metal" OFF)
+tvm_option(USE_ROCM "Build with ROCM" OFF)
+tvm_option(ROCM_PATH "The path to rocm" /opt/rocm)
tvm_option(USE_RPC "Build with RPC" ON)
+tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
-tvm_option(USE_LLVM "Build with LLVM" OFF)
+tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
tvm_option(USE_RTTI "Build with RTTI" ON)
tvm_option(USE_MSVC_MT "Build with MT" OFF)
tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
+# Contrib library options
+tvm_option(USE_BLAS "The blas library to be linked" none)
+tvm_option(USE_MKL_PATH "MKL root path when use MKL blas" none)
+tvm_option(USE_CUDNN "Build with cuDNN" OFF)
+tvm_option(USE_CUBLAS "Build with cuBLAS" OFF)
+tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF)
+tvm_option(USE_ROCBLAS "Build with ROCM:rocBLAS" OFF)
+tvm_option(USE_SORT "Build with sort support" OFF)
+tvm_option(USE_NNPACK "Build with nnpack support" OFF)
+tvm_option(USE_RANDOM "Build with random support" OFF)
+
+# include directories
include_directories("include")
-include_directories("HalideIR/src")
include_directories("dlpack/include")
+include_directories("dmlc-core/include")
-
+# initial variables
set(TVM_LINKER_LIBS "")
set(TVM_RUNTIME_LINKER_LIBS "")
-# compile
+# Generic compilation options
if(MSVC)
add_definitions(-DWIN32_LEAN_AND_MEAN)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
add_definitions(-D_SCL_SECURE_NO_WARNINGS)
- add_definitions(-DTVM_EXPORTS)
add_definitions(-DHalide_SHARED)
- add_definitions(-DHalide_EXPORTS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj")
@@ -61,146 +76,142 @@ if(MSVC)
else(MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
- set(CMAKE_C_FLAGS "-O3 -Wall -std=c++11 -fPIC")
- set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS})
+ set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+ set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
+ endif()
endif(MSVC)
# add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp")
-FILE(GLOB_RECURSE GROUP_Include "src/*.h" "include/*.h" "HalideIR/src/*.h")
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+ "nnvm/src/*.h" "nnvm/include/*.h")
assign_source_group("Source" ${GROUP_SOURCE})
-assign_source_group("Include" ${GROUP_Include})
+assign_source_group("Include" ${GROUP_INCLUDE})
+# Source file lists
file(GLOB COMPILER_SRCS
src/api/*.cc
src/arithmetic/*.cc
+ src/autotvm/*.cc
src/codegen/*.cc
src/codegen/stack_vm/*.cc
src/lang/*.cc
src/pass/*.cc
src/op/*.cc
src/schedule/*.cc
+ )
+
+if(NOT MSVC)
+ file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc)
+ list(APPEND COMPILER_SRCS ${COMPILER_VERILOG_SRCS})
+endif()
+
+file(GLOB_RECURSE NNVM_COMPILER_SRCS
+ nnvm/src/c_api/*.cc
+ nnvm/src/core/*.cc
+ nnvm/src/pass/*.cc
+ nnvm/src/compiler/*.cc
+ nnvm/src/top/*.cc
+ )
+
+file(GLOB TOPI_SRCS
+ topi/src/*.cc
)
file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
file(GLOB RUNTIME_SRCS src/runtime/*.cc)
-file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
-file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
-file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
-file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
-file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
-file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
-
-if(USE_CUDA)
-find_package(CUDA)
-# Find CUDA doesn't find all the libraries we need, add the extra ones
-find_library(CUDA_CUDA_LIBRARIES cuda
- PATHS ${CUDA_TOOLKIT_ROOT_DIR}
- PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
-find_library(CUDA_NVRTC_LIBRARIES nvrtc
- PATHS ${CUDA_TOOLKIT_ROOT_DIR}
- PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
- set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARIES})
-
- find_package(CUDA QUIET REQUIRED)
- message(STATUS "Build with CUDA support")
- include_directories(${CUDA_INCLUDE_DIRS})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
- list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
- if(MSVC)
- find_library(CUDA_NVRTC_LIB nvrtc
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/win32)
- list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB})
- else(MSVC)
- find_library(CUDA_NVRTC_LIB nvrtc
- ${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib)
- list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB})
- endif(MSVC)
- add_definitions(-DTVM_CUDA_RUNTIME=1)
-else(USE_CUDA)
- add_definitions(-DTVM_CUDA_RUNTIME=0)
-endif(USE_CUDA)
-
-if(USE_OPENCL)
- find_package(OpenCL QUIET REQUIRED)
- message(STATUS "Build with OpenCL support")
- include_directories(${OPENCL_INCLUDE_DIRS})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
- list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
- add_definitions(-DTVM_OPENCL_RUNTIME=1)
-else(USE_OPENCL)
- add_definitions(-DTVM_OPENCL_RUNTIME=0)
-endif(USE_OPENCL)
-
-if(USE_METAL)
- find_package(OpenCL QUIET REQUIRED)
- message(STATUS "Build with Metal support")
- FIND_LIBRARY(METAL_LIB Metal)
- FIND_LIBRARY(FOUNDATION_LIB Foundation)
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${METAL_LIB} ${FOUNDATION_LIB})
- list(APPEND RUNTIME_SRCS ${RUNTIME_METAL_SRCS})
- add_definitions(-DTVM_METAL_RUNTIME=1)
-else(USE_METAL)
- add_definitions(-DTVM_METAL_RUNTIME=0)
-endif(USE_METAL)
+
+# Package runtime rules
+if(NOT USE_RTTI)
+ add_definitions(-DDMLC_ENABLE_RTTI=0)
+endif()
if(USE_RPC)
message(STATUS "Build with RPC support...")
+ file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS})
endif(USE_RPC)
if(USE_GRAPH_RUNTIME)
message(STATUS "Build with Graph runtime support...")
+ file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
-endif(USE_GRAPH_RUNTIME)
-if(USE_LLVM)
- find_package(LLVM CONFIG REQUIRED)
- include_directories(${LLVM_INCLUDE_DIRS})
- add_definitions(${LLVM_DEFINITIONS})
- set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
- message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
- message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
- add_definitions(-DTVM_LLVM_VERSION=${TVM_LLVM_VERSION})
- add_definitions(-DDMLC_USE_FOPEN64=0)
- llvm_map_components_to_libnames(LLVM_LIBS all)
- list(REMOVE_ITEM LLVM_LIBS LTO)
- list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
- list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
- if(NOT MSVC)
- set_property(SOURCE ${COMPILER_LLVM_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS
- "-fno-rtti -DDMLC_ENABLE_RTTI=0")
- endif()
-endif(USE_LLVM)
-
-if(NOT USE_RTTI)
- add_definitions(-DDMLC_ENABLE_RTTI=0)
-endif()
+ if(USE_GRAPH_RUNTIME_DEBUG)
+ set_source_files_properties(${RUNTIME_GRAPH_SRCS}
+ PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
+ endif(USE_GRAPH_RUNTIME_DEBUG)
+endif(USE_GRAPH_RUNTIME)
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/CMakeLists.txt)
- include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include)
- if (INSTALL_DEV)
- install(
- DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include/." DESTINATION "include"
- FILES_MATCHING
- PATTERN "*.h"
- )
- endif()
-elseif(DMLC_CORE_PATH)
- include_directories(${DMLC_CORE_PATH}/include)
-endif()
+# Module rules
+include(cmake/modules/VTA.cmake)
+include(cmake/modules/CUDA.cmake)
+include(cmake/modules/OpenCL.cmake)
+include(cmake/modules/OpenGL.cmake)
+include(cmake/modules/Vulkan.cmake)
+include(cmake/modules/Metal.cmake)
+include(cmake/modules/ROCM.cmake)
+include(cmake/modules/LLVM.cmake)
+include(cmake/modules/contrib/BLAS.cmake)
+include(cmake/modules/contrib/Random.cmake)
+include(cmake/modules/contrib/Sort.cmake)
+include(cmake/modules/contrib/NNPack.cmake)
-list(APPEND RUNTIME_SRCS ${GROUP_Include})
add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
+add_library(tvm_topi SHARED ${TOPI_SRCS})
add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
+add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
+
target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
-target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(tvm_topi tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(nnvm_compiler tvm)
+
+# Related headers
+target_include_directories(
+ tvm
+ PUBLIC "HalideIR/src"
+ PUBLIC "topi/include")
+target_include_directories(
+ tvm_topi
+ PUBLIC "topi/include")
+target_include_directories(
+ nnvm_compiler
+ PUBLIC "nnvm/include"
+ PUBLIC "topi/include")
+
+# Tests
+set(TEST_EXECS "")
+file(GLOB TEST_SRCS tests/cpp/*.cc)
+find_library(GTEST_LIB gtest)
+
+if(GTEST_LIB)
+ foreach(__srcpath ${TEST_SRCS})
+ get_filename_component(__srcname ${__srcpath} NAME)
+ string(REPLACE ".cc" "" __execname ${__srcname})
+ add_executable(${__execname} ${__srcpath})
+ list(APPEND TEST_EXECS ${__execname})
+ target_link_libraries(${__execname}
+ tvm ${GTEST_LIB} pthread)
+ set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1)
+ set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
+ endforeach()
+ add_custom_target(cpptest DEPENDS ${TEST_EXECS})
+endif()
+
+# Custom targets
+add_custom_target(runtime DEPENDS tvm_runtime)
+
+# Installation rules
+install(TARGETS tvm DESTINATION lib${LIB_SUFFIX})
+install(TARGETS tvm_topi DESTINATION lib${LIB_SUFFIX})
install(TARGETS tvm_runtime DESTINATION lib${LIB_SUFFIX})
+install(TARGETS nnvm_compiler DESTINATION lib${LIB_SUFFIX})
+
if (INSTALL_DEV)
- install(TARGETS tvm DESTINATION lib${LIB_SUFFIX})
install(
DIRECTORY "include/." DESTINATION "include"
FILES_MATCHING
@@ -220,11 +231,25 @@ if (INSTALL_DEV)
DIRECTORY "dlpack/include/." DESTINATION "include"
FILES_MATCHING
PATTERN "*.h"
- )
+ )
+ install(
+ DIRECTORY "nnvm/include/." DESTINATION "include"
+ FILES_MATCHING
+ PATTERN "*.h"
+ )
else(INSTALL_DEV)
install(
DIRECTORY "include/tvm/runtime/." DESTINATION "include/tvm/runtime"
FILES_MATCHING
PATTERN "*.h"
- )
+ )
endif(INSTALL_DEV)
+
+# More target definitions
+if(MSVC)
+ target_compile_definitions(tvm PRIVATE -DHalide_EXPORTS)
+ target_compile_definitions(tvm_runtime PRIVATE -DHalide_EXPORTS)
+ target_compile_definitions(tvm PRIVATE -DTVM_EXPORTS)
+ target_compile_definitions(tvm_runtime PRIVATE -DTVM_EXPORTS)
+ target_compile_definitions(nnvm_compiler PRIVATE -DNNVM_EXPORTS)
+endif()
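
The new CMake flow is driven by a `config.cmake` copied into the build directory, with options overridden by appended `set(OPTION VALUE)` lines; the Jenkinsfile below uses the same pattern. A minimal local-build sketch under that convention (the option values are illustrative, not required):

```bash
# Out-of-source CMake build; option values here are examples only.
mkdir -p build && cd build
cp ../cmake/config.cmake .
echo 'set(USE_LLVM llvm-config-6.0)' >> config.cmake  # ON, OFF, or a path to llvm-config
echo 'set(USE_CUDA ON)' >> config.cmake               # enable optional backends as needed
cmake ..
make -j4
```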
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index ab9950a9f31d..6e3cf55b94b0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,37 +1,42 @@
-Contributors of TVM
-===================
-TVM adopts Apache style committer model. The package is developed and used by the community.
+TVM Contributors
+================
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use,
+contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community.
-We actively seek committers that comes from contributors who:
-- Made substantial contribution to the project.
-- Willing to spent time on maintaining and lead the project.
+See the [community structure document](http://docs.tvm.ai/contribute/community.html) for an explanation of the community structure and contribution guidelines.
-How to Contribute
------------------
-See [Contributor guide](docs/how_to/contribute.md) on how to contribute
-
-Committers
-----------
-Committers are people who have made substantial contribution to the project and granted write access to the project.
-- [Tianqi Chen](https://github.com/tqchen), University of Washington
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/), University of Washington
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/), University of Washington
-- [Ziheng Jiang](https://github.com/ZihengJiang), Fudan University
-
-Code Owners
------------
-[Code owners](CODEOWNERS) are people who make substantial contribution to a module
-and are qualified to lead development and review changes of the owned module.
+## Committers
+- [Tianqi Chen](https://github.com/tqchen) (PMC)
+- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
+- [Ziheng Jiang](https://github.com/ZihengJiang)
+- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
+- [Yizhi Liu](https://github.com/yzhliu)
+## Code Owners
- [Aditya Atluri](https://github.com/adityaatluri) ROCM
- [Leyuan Wang](https://github.com/Laurawly) TOPI
- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Yizhi Liu](https://github.com/javelinjs) JVM package
+- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
+- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+
+## Reviewers
+- [Masahiro Masuda](https://github.com/masahi)
+- [Kazutaka Morita](https://github.com/kazum)
+- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
+- [Siva](https://github.com/srkreddy1238)
+- [Alex Weaver](https://github.com/alex-weaver)
+- [Eddie Yan](https://github.com/eqy)
+- [Joshua Z. Zhang](https://github.com/zhreshold)
+- [Lianmin Zheng](https://github.com/merrymercy)
-List of Contributors
---------------------
+## List of Contributors
- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
- To contributors: please add your name to the list.
- [Qiao Zhang](https://github.com/zhangqiaorjc)
- [Jian Weng](https://github.com/were)
- [Masahiro Masuda](https://github.com/masahi)
+- [Haolong Zhang](https://github.com/haolongzhangm)
+- [Cody Hao Yu](https://github.com/comaniac)
+- [Chris Nuernberger](https://github.com/cnuernber)
+- [Tatsuya Nishiyama](https://github.com/nishi-t)
+- [Kazutaka Morita](https://github.com/kazum)
diff --git a/HalideIR b/HalideIR
index d91cf97d5d6c..a0b9563f4571 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit d91cf97d5d6cd2b47ec408bb08e978b88cbf6ab7
+Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
diff --git a/Jenkinsfile b/Jenkinsfile
index ef9666351ba5..8d76ebedeaae 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -4,13 +4,14 @@
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
// tvm libraries
-tvm_runtime = "lib/libtvm_runtime.so, config.mk"
-tvm_lib = "lib/libtvm.so, " + tvm_runtime
+tvm_runtime = "build/libtvm_runtime.so, build/config.cmake"
+tvm_lib = "build/libtvm.so, " + tvm_runtime
// LLVM upstream lib
-tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime
+tvm_multilib = "build/libtvm.so, " +
+ "build/libvta.so, build/libtvm_topi.so, build/libnnvm_compiler.so, " + tvm_runtime
// command to start a docker container
-docker_run = 'tests/ci_build/ci_build.sh'
+docker_run = 'docker/bash.sh'
// timeout in minutes
max_time = 60
@@ -38,7 +39,7 @@ stage("Sanity Check") {
node('linux') {
ws('workspace/tvm/sanity') {
init_git()
- sh "${docker_run} lint ./tests/scripts/task_lint.sh"
+ sh "${docker_run} tvmai/ci-lint ./tests/scripts/task_lint.sh"
}
}
}
@@ -47,14 +48,14 @@ stage("Sanity Check") {
// Run make. First try to do an incremental make from a previous workspace in hope to
// accelerate the compilation. If something wrong, clean the workspace and then
// build from scratch.
-def make(docker_type, make_flag) {
+def make(docker_type, path, make_flag) {
timeout(time: max_time, unit: 'MINUTES') {
try {
- sh "${docker_run} ${docker_type} make ${make_flag}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
} catch (exc) {
echo 'Incremental compilation failed. Fall back to build from scratch'
- sh "${docker_run} ${docker_type} make clean"
- sh "${docker_run} ${docker_type} make ${make_flag}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
}
}
}
@@ -84,30 +85,35 @@ stage('Build') {
ws('workspace/tvm/build-gpu') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDNN=1 >> config.mk
- echo USE_CUDA=1 >> config.mk
- echo USE_OPENCL=1 >> config.mk
- echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
- echo USE_RPC=1 >> config.mk
- echo USE_GRAPH_RUNTIME=1 >> config.mk
- echo USE_BLAS=openblas >> config.mk
- rm -f lib/libtvm_runtime.so lib/libtvm.so
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_CUBLAS ON\\) >> config.cmake
+ echo set\\(USE_CUDNN ON\\) >> config.cmake
+ echo set\\(USE_CUDA ON\\) >> config.cmake
+ echo set\\(USE_OPENGL ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-6.0\\) >> config.cmake
+ echo set\\(USE_RPC ON\\) >> config.cmake
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
+ echo set\\(USE_BLAS openblas\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm40.so"
- sh "echo LLVM_CONFIG=llvm-config-5.0 >> config.mk"
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm50.so"
- sh "echo LLVM_CONFIG=llvm-config-6.0 >> config.mk"
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm60.so"
+ make('tvmai/ci-gpu', 'build', '-j2')
pack_lib('gpu', tvm_multilib)
+ // compiler test
sh """
- echo USE_ROCM=1 >> config.mk
- echo ROCM_PATH=/opt/rocm >> config.mk
+ mkdir -p build2
+ cd build2
+ cp ../cmake/config.cmake .
+ echo set\\(USE_OPENCL ON\\) >> config.cmake
+ echo set\\(USE_ROCM ON\\) >> config.cmake
+ echo set\\(USE_VULKAN ON\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER clang-6.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('gpu', '-j2')
+ make('tvmai/ci-gpu', 'build2', '-j2')
}
}
},
@@ -116,13 +122,20 @@ stage('Build') {
ws('workspace/tvm/build-cpu') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo USE_RPC=0 >> config.mk
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('cpu', '-j2')
+ make('tvmai/ci-cpu', 'build', '-j2')
pack_lib('cpu', tvm_lib)
+ timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
+ sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
+ }
}
}
},
@@ -131,48 +144,19 @@ stage('Build') {
ws('workspace/tvm/build-i386') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
- echo USE_RPC=1 >> config.mk
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_RPC ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-5.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm40.so"
- sh "echo LLVM_CONFIG=llvm-config-5.0 >> config.mk"
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm50.so"
- sh "echo LLVM_CONFIG=llvm-config-6.0 >> config.mk"
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm60.so"
+ make('tvmai/ci-i386', 'build', '-j2')
pack_lib('i386', tvm_multilib)
}
}
- },
- 'web': {
- node('emcc') {
- ws('workspace/tvm/build-weblib') {
- init_git()
- sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo LLVM_CONFIG=llvm-config >> config.mk
- echo USE_RPC=0 >> config.mk
- """
- sh "${docker_run} emscripten echo testing javascript..."
- timeout(time: max_time, unit: 'MINUTES') {
- try {
- sh "${docker_run} emscripten ./tests/scripts/task_web_build.sh"
- } catch (exc) {
- echo 'Incremental compilation failed. Fall back to build from scratch'
- sh "${docker_run} emscripten make clean"
- sh "${docker_run} emscripten ./tests/scripts/task_web_build.sh"
- }
- }
- pack_lib('weblib', tvm_lib)
- }
- }
}
}
@@ -182,14 +166,8 @@ stage('Unit Test') {
ws('workspace/tvm/ut-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_unittest.sh"
- }
- // Test on the lastest mainline.
- sh "cp lib/libtvm_llvm60.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_unittest.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_unittest.sh"
}
}
}
@@ -199,26 +177,10 @@ stage('Unit Test') {
ws('workspace/tvm/ut-python-i386') {
init_git()
unpack_lib('i386', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} i386 ./tests/scripts/task_python_unittest.sh"
- sh "${docker_run} i386 ./tests/scripts/task_python_integration.sh"
- }
- // Test on llvm 5.0
- sh "cp lib/libtvm_llvm50.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} i386 ./tests/scripts/task_python_integration.sh"
- }
- }
- }
- },
- 'cpp': {
- node('linux') {
- ws('workspace/tvm/ut-cpp') {
- init_git()
- unpack_lib('cpu', tvm_lib)
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} cpu ./tests/scripts/task_cpp_unittest.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_unittest.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_integration.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_vta.sh"
}
}
}
@@ -228,9 +190,8 @@ stage('Unit Test') {
ws('workspace/tvm/ut-java') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_java_unittest.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_java_unittest.sh"
}
}
}
@@ -243,22 +204,11 @@ stage('Integration Test') {
ws('workspace/tvm/it-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_integration.sh"
- sh "${docker_run} gpu ./tests/scripts/task_python_topi.sh"
- }
- }
- }
- },
- 'web': {
- node('emcc') {
- ws('workspace/tvm/it-weblib') {
- init_git()
- unpack_lib('weblib', tvm_lib)
- sh "${docker_run} emscripten echo testing javascript..."
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} emscripten ./tests/scripts/task_web_test.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_integration.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_topi.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_cpp_topi.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_nnvm.sh"
}
}
}
@@ -268,9 +218,8 @@ stage('Integration Test') {
ws('workspace/tvm/docs-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_docs.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_docs.sh"
}
pack_lib('mydocs', 'docs.tgz')
}
diff --git a/Makefile b/Makefile
index 4a16d5162102..2d3d4843c4c0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,7 @@
ROOTDIR = $(CURDIR)
-ifndef config
-ifneq ("$(wildcard ./config.mk)","")
- config ?= config.mk
-else
- config ?= make/config.mk
-endif
-endif
-
-include $(config)
-
-.PHONY: clean install installdev all test doc pylint cpplint lint verilog cython cython2 cython3 web runtime
+.PHONY: clean all test doc pylint cpplint lint\
+ cython cython2 cython3 web runtime vta
ifndef DMLC_CORE_PATH
DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
@@ -20,242 +11,65 @@ ifndef DLPACK_PATH
DLPACK_PATH = $(ROOTDIR)/dlpack
endif
-UNAME_S := $(shell uname -s)
-
-# The flags
-LLVM_CFLAGS= -fno-rtti -DDMLC_ENABLE_RTTI=0 -DDMLC_USE_FOPEN64=0
-LDFLAGS = -pthread -lm -ldl
-INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include -IHalideIR/src -Itopi/include
-CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC
-FRAMEWORKS =
-OBJCFLAGS = -fno-objc-arc
-EMCC_FLAGS= -s RESERVED_FUNCTION_POINTERS=2 -s NO_EXIT_RUNTIME=1 -s MAIN_MODULE=1 -DDMLC_LOG_STACK_TRACE=0\
- -std=c++11 -Oz $(INCLUDE_FLAGS)
-
-# llvm configuration
-ifdef LLVM_CONFIG
- LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
- LLVM_INCLUDE=$(filter -I%, $(shell $(LLVM_CONFIG) --cxxflags))
- LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags --libs --system-libs)
- LLVM_CFLAGS += $(LLVM_INCLUDE) -DTVM_LLVM_VERSION=$(LLVM_VERSION)
-else
- LLVM_VERSION=00
-endif
-
-# The source code dependencies
-LIB_HALIDEIR = HalideIR/lib/libHalideIR.a
+INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
+PKG_CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC
+PKG_LDFLAGS =
-CC_SRC = $(filter-out src/contrib/%.cc src/runtime/%.cc src/codgen/llvm/%.cc,\
- $(wildcard src/*/*.cc src/*/*/*.cc))
-LLVM_SRC = $(wildcard src/codegen/llvm/*.cc src/codegen/llvm/*/*.cc)
-METAL_SRC = $(wildcard src/runtime/metal/*.mm)
-CUDA_SRC = $(wildcard src/runtime/cuda/*.cc)
-ROCM_SRC = $(wildcard src/runtime/rocm/*.cc)
-OPENCL_SRC = $(wildcard src/runtime/opencl/*.cc)
-RPC_SRC = $(wildcard src/runtime/rpc/*.cc)
-GRAPH_SRC = $(wildcard src/runtime/graph/*.cc)
-RUNTIME_SRC = $(wildcard src/runtime/*.cc)
-# Objectives
-LLVM_BUILD = build/llvm${LLVM_VERSION}
-LLVM_OBJ = $(patsubst src/%.cc, ${LLVM_BUILD}/%.o, $(LLVM_SRC))
-METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
-CUDA_OBJ = $(patsubst src/%.cc, build/%.o, $(CUDA_SRC))
-ROCM_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCM_SRC))
-OPENCL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENCL_SRC))
-RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC))
-GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC))
-CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ)
-RUNTIME_OBJ = $(patsubst src/%.cc, build/%.o, $(RUNTIME_SRC))
-CONTRIB_OBJ =
+all:
+ @mkdir -p build && cd build && cmake .. && $(MAKE)
-# Deps
-ALL_DEP = $(CC_OBJ) $(CONTRIB_OBJ) $(LIB_HALIDEIR)
-RUNTIME_DEP = $(RUNTIME_OBJ)
+runtime:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) runtime
-# Dependency specific rules
-ifdef CUDA_PATH
- NVCC=$(CUDA_PATH)/bin/nvcc
- CFLAGS += -I$(CUDA_PATH)/include
- LDFLAGS += -L$(CUDA_PATH)/lib64
-endif
+vta:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) vta
-ifeq ($(USE_CUDA), 1)
- CFLAGS += -DTVM_CUDA_RUNTIME=1
- LDFLAGS += -lcuda -lcudart -lnvrtc
- RUNTIME_DEP += $(CUDA_OBJ)
-else
- CFLAGS += -DTVM_CUDA_RUNTIME=0
-endif
+cpptest:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) cpptest
-ifdef ROCM_PATH
- CFLAGS += -I$(ROCM_PATH)/include
- LDFLAGS += -L$(ROCM_PATH)/lib
-endif
+# EMCC; Web related scripts
+EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\
+ -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\
+ -s TOTAL_MEMORY=1073741824\
+ -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\
+ -s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\
+ $(INCLUDE_FLAGS)
-ifeq ($(USE_ROCM), 1)
- CFLAGS += -DTVM_ROCM_RUNTIME=1 -D__HIP_PLATFORM_HCC__=1
- LDFLAGS += -lhip_hcc
- RUNTIME_DEP += $(ROCM_OBJ)
-else
- CFLAGS += -DTVM_ROCM_RUNTIME=0
-endif
-
-ifeq ($(USE_OPENCL), 1)
- CFLAGS += -DTVM_OPENCL_RUNTIME=1
- ifeq ($(UNAME_S), Darwin)
- FRAMEWORKS += -framework OpenCL
- else
- LDFLAGS += -lOpenCL
- endif
- RUNTIME_DEP += $(OPENCL_OBJ)
-else
- CFLAGS += -DTVM_OPENCL_RUNTIME=0
-endif
-
-ifeq ($(USE_METAL), 1)
- CFLAGS += -DTVM_METAL_RUNTIME=1
- LDFLAGS += -lobjc
- RUNTIME_DEP += $(METAL_OBJ)
- FRAMEWORKS += -framework Metal -framework Foundation
-else
- CFLAGS += -DTVM_METAL_RUNTIME=0
-endif
-
-ifeq ($(USE_RPC), 1)
- RUNTIME_DEP += $(RPC_OBJ)
-endif
-
-ifeq ($(USE_GRAPH_RUNTIME), 1)
- RUNTIME_DEP += $(GRAPH_OBJ)
-endif
-
-include make/contrib/cblas.mk
-include make/contrib/nnpack.mk
-include make/contrib/cudnn.mk
-
-ifdef ADD_CFLAGS
- CFLAGS += $(ADD_CFLAGS)
-endif
-
-ifdef ADD_LDFLAGS
- LDFLAGS += $(ADD_LDFLAGS)
-endif
-
-ifeq ($(OS),Windows_NT)
- JVM_PKG_PROFILE := windows
- SHARED_LIBRARY_SUFFIX := dll
-else
- UNAME_S := $(shell uname -s)
- ifeq ($(UNAME_S), Darwin)
- JVM_PKG_PROFILE := osx-x86_64
- SHARED_LIBRARY_SUFFIX := dylib
- else
- JVM_PKG_PROFILE := linux-x86_64
- SHARED_LIBRARY_SUFFIX := so
- endif
-endif
+web: build/libtvm_web_runtime.js build/libtvm_web_runtime.bc
-JVM_TEST_ARGS := $(if $(JVM_TEST_ARGS),$(JVM_TEST_ARGS),-DskipTests -Dcheckstyle.skip=true)
-
-ifeq ($(USE_CUDA), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else ifeq ($(USE_OPENCL), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else ifeq ($(USE_METAL), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-cpu
-endif
-
-BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
-all: ${BUILD_TARGETS}
-runtime: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
-web: lib/libtvm_web_runtime.js lib/libtvm_web_runtime.bc
-
-include tests/cpp/unittest.mk
-
-test: $(TEST)
-
-include verilog/verilog.mk
-verilog: $(VER_LIBS)
-
-# Special rules for LLVM related modules.
-${LLVM_BUILD}/codegen/llvm/%.o: src/codegen/llvm/%.cc
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(LLVM_CFLAGS) -MM -MT ${LLVM_BUILD}/codegen/llvm/$*.o $< >${LLVM_BUILD}/codegen/llvm/$*.d
- $(CXX) -c $(CFLAGS) $(LLVM_CFLAGS) -c $< -o $@
-
-build/runtime/metal/%.o: src/runtime/metal/%.mm
- @mkdir -p $(@D)
- $(CXX) $(OBJCFLAGS) $(CFLAGS) -MM -MT build/runtime/metal/$*.o $< >build/runtime/metal/$*.d
- $(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@
-
-build/%.o: src/%.cc
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
- $(CXX) -c $(CFLAGS) -c $< -o $@
-
-lib/libtvm.dylib: $(ALL_DEP) $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_runtime.dylib: $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm.so: $(ALL_DEP) $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_runtime.so: $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_web_runtime.bc: web/web_runtime.cc
+build/libtvm_web_runtime.bc: web/web_runtime.cc
@mkdir -p build/web
@mkdir -p $(@D)
- $(CXX) $(CFLAGS) -MM -MT lib/libtvm_web_runtime.bc $< >build/web/web_runtime.d
+ emcc $(EMCC_FLAGS) -MM -MT build/libtvm_web_runtime.bc $< >build/web/web_runtime.d
emcc $(EMCC_FLAGS) -o $@ web/web_runtime.cc
-lib/libtvm_web_runtime.js: lib/libtvm_web_runtime.bc
+build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
@mkdir -p $(@D)
- emcc $(EMCC_FLAGS) -o $@ lib/libtvm_web_runtime.bc
-
-$(LIB_HALIDEIR): LIBHALIDEIR
-
-LIBHALIDEIR:
- + cd HalideIR; make lib/libHalideIR.a DMLC_CORE_PATH=../dmlc-core; cd $(ROOTDIR)
+ emcc $(EMCC_FLAGS) -o $@ build/libtvm_web_runtime.bc
+# Lint scripts
cpplint:
- python dmlc-core/scripts/lint.py topi cpp topi/include;
- python dmlc-core/scripts/lint.py tvm cpp include src verilog\
+ python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+ python3 dmlc-core/scripts/lint.py topi cpp topi/include;
+ python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+ python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
examples/extension/src examples/graph_executor/src
pylint:
- pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
- pylint topi/python/topi --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint topi/python/topi --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint nnvm/python/nnvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
jnilint:
- python dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+ python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
lint: cpplint pylint jnilint
doc:
doxygen docs/Doxyfile
-install: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
- mkdir -p $(DESTDIR)$(PREFIX)/include/tvm/runtime
- cp -R include/tvm/runtime/. $(DESTDIR)$(PREFIX)/include/tvm/runtime
- cp lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
-
-installdev: lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) lib/libtvm.a
- mkdir -p $(DESTDIR)$(PREFIX)/include
- cp -R include/tvm $(DESTDIR)$(PREFIX)/include
- cp lib/libtvm.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
- cp lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
- cp lib/libtvm.a $(DESTDIR)$(PREFIX)/lib
-
# Cython build
cython:
cd python; python setup.py build_ext --inplace
@@ -269,22 +83,34 @@ cython3:
cyclean:
rm -rf python/tvm/*/*/*.so python/tvm/*/*/*.dylib python/tvm/*/*/*.cpp
+# JVM build rules
+ifeq ($(OS),Windows_NT)
+ JVM_PKG_PROFILE := windows
+ SHARED_LIBRARY_SUFFIX := dll
+else
+ UNAME_S := $(shell uname -s)
+ ifeq ($(UNAME_S), Darwin)
+ JVM_PKG_PROFILE := osx-x86_64
+ SHARED_LIBRARY_SUFFIX := dylib
+ else
+ JVM_PKG_PROFILE := linux-x86_64
+ SHARED_LIBRARY_SUFFIX := so
+ endif
+endif
+
+JVM_TEST_ARGS := $(if $(JVM_TEST_ARGS),$(JVM_TEST_ARGS),-DskipTests -Dcheckstyle.skip=true)
+
jvmpkg:
(cd $(ROOTDIR)/jvm; \
mvn clean package -P$(JVM_PKG_PROFILE) -Dcxx="$(CXX)" \
- -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
- -Dcurrent_libdir="$(ROOTDIR)/lib" $(JVM_TEST_ARGS))
+ -Dcflags="$(PKG_CFLAGS)" -Dldflags="$(PKG_LDFLAGS)" \
+ -Dcurrent_libdir="$(ROOTDIR)/build" $(JVM_TEST_ARGS))
jvminstall:
(cd $(ROOTDIR)/jvm; \
mvn install -P$(JVM_PKG_PROFILE) -Dcxx="$(CXX)" \
- -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
- -Dcurrent_libdir="$(ROOTDIR)/lib" $(JVM_TEST_ARGS))
+ -Dcflags="$(PKG_CFLAGS)" -Dldflags="$(PKG_LDFLAGS)" \
+ -Dcurrent_libdir="$(ROOTDIR)/build" $(JVM_TEST_ARGS))
+# clean rule
clean:
- $(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o */*.d */*/*.d */*/*/*.d
- cd HalideIR; make clean; cd $(ROOTDIR)
-
--include build/*.d
--include build/*/*.d
--include build/*/*/*.d
--include build/*/*/*/*.d
+ @mkdir -p build && cd build && cmake .. && $(MAKE) clean
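
After this change the Makefile is a thin shim over CMake: each top-level target configures `build/` and forwards to the generated makefile, so there is no `config.mk` to edit. For example:

```bash
make           # configure with CMake, then build all libraries
make runtime   # build only the tvm_runtime library
make cpptest   # build the gtest-based C++ unit tests (requires libgtest)
```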
diff --git a/NEWS.md b/NEWS.md
index 6bc97b163ab1..567aabf3fcbd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,11 +3,104 @@ TVM Change Log
This file records the changes in TVM library in reverse chronological order.
+## On-going version
-## On onging verison
+Refer to the Roadmap issue for a complete list of features in the on-going version.
+If you check in something that is not reflected in the Roadmap issue, please reply
+to that issue so it can be added.
+
+## 0.3
+
+This release features numerous improvements in TOPI and backends. We take the first step toward object detection support in TOPI, featuring the operators necessary for YOLO and SSD. TOPI now supports a numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs in the browser, and Vulkan for running on the next-generation graphics API.
+
+- TOPI Vision operators
+ - SSD support
+ - YOLO support
+ - NMS operator support in vision
+- TOPI general numpy-style operators
+ - numpy style operator overload in topi
+ - more operators: flip, take
+ - dilation support on conv2d and depthwise
+- 8bit support
+ - ARM 8bit gemm
+ - ARM 8bit conv
+- Low bit operator support
+ - popcount intrinsics
+ - 1-bit fully connected
+- Contrib: MPSDNN fully-connected and conv2d support
+- Better RPC support
+ - RPC Tracker support to allow centralized resource management
+ - RPC protocol upgrade (this is a non-backward compatible change) to support timeout in the proxy
+    - This is a breaking change; you need to use the latest version of the TVM runtime with the RPC
+ - Fault-tolerant to early server termination with correct exception propagated
+ - RPC support enabled for ROCm AMDGPUs
+- Tutorials and docs
+ - How to deploy to android devices.
+- Optimizations for hardware backends
+ - intel CPU (AVX and AVX512)
+- Schedule Primitives
+  - rfactor now supports factor_axis to specify the factored dimension in the result
+  - cache_write now supports multiple output operators
+ - enable warp memory which generates shuffle instructions
+- Framework bridge
+ - MXNet bridge supported
+- C++ compiler API support
+ - build migration
+ - topi migration to c++
+ - Target system in c++
+- WebGL backend
+ - runtime and codegen
+ - topi integration
+ - end to end pipeline on the browser
+- Vulkan backend
+ - vulkan runtime
+ - spirv code generator
+- Security
+ - intel SGX runtime support
+ - multi-threaded SGX runtime
+- LLVM 7.0 support
+- Robustness
+  - VerifyMemory to detect incorrect GPU schedules that write into GPU memory from the CPU
+ - Verify compute formulas
+- Better CPU parallel runtime
+
+## 0.2
+
+This release comes with a complete set of TOPI support for the NNVM compiler, which allows compilation of end-to-end workloads.
+We also make major improvements in supporting new backends: ROCm for AMD GPUs and ARM GPUs.
+
+- Backend support
+  - Support LLVM mainline (4.0, 5.0, 6.0)
+ - Support ROCM stack for AMD GPUs
+ - More robust OpenCL support for ARM GPUs
+- Android RPC runtime
+- Multi-threading optimization for ARM
+ - multi-threaded depthwise
+ - multi-threaded conv2d
+- New schedule primitives
+ - storage_align for shared memory alignment
+ - double_buffer
- UnrollLoop : more robust version of unroll loop, count maximum steps that can be unrolled.
+- Full set of TOPI operators
+  - Introduce tvm.target to better specify target options for compilation
+  - broadcast/reduction operators
+ - pooling and global pooling
+ - Generic target support for topi
+ - schedule with external libraries
+- End to end deep learning pipelines for CPU, GPU, ARM GPU
+- Tutorials
+ - How to load compiled module in any language runtime
+ - How to use java runtime
+- Contrib library: MIOpen, CuDNN
+- Ongoing items that contain functioning pieces
+ - WebGL backend
+ - C++ compiler support
+ - MPS DNN
+ - low bit support, introduced popcount
+
+
+## 0.1
-## 0.1rc
- Language runtime
- python
- javascript
diff --git a/README.md b/README.md
index 07e550d76043..561ca91d5abe 100644
--- a/README.md
+++ b/README.md
@@ -1,33 +1,27 @@
-TVM: Tensor IR Stack for Deep Learning Systems
+Open Deep Learning Compiler Stack
==============================================
-[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
-[![Build Status](http://mode-gpu.cs.washington.edu:8080/buildStatus/icon?job=dmlc/tvm/master)](http://mode-gpu.cs.washington.edu:8080/job/dmlc/job/tvm/job/master/)
+[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE)
+[![Build Status](http://mode-gpu.cs.washington.edu:8080/buildStatus/icon?job=tvm/master)](http://mode-gpu.cs.washington.edu:8080/job/tvm/job/master/)
-[Installation](docs/how_to/install.md) |
-[Documentation](http://docs.tvmlang.org) |
-[Tutorials](http://tutorials.tvmlang.org) |
-[Operator Inventory](topi) |
-[FAQ](docs/faq.md) |
+[Documentation](https://docs.tvm.ai) |
[Contributors](CONTRIBUTORS.md) |
+[Community](https://tvm.ai/community.html) |
[Release Notes](NEWS.md)
-TVM is a Tensor intermediate representation(IR) stack for deep learning systems. It is designed to close the gap between the
+TVM is a compiler stack for deep learning systems. It is designed to close the gap between the
productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends.
TVM works with deep learning frameworks to provide end to end compilation to different backends.
-Checkout our [announcement](http://tvmlang.org/2017/08/17/tvm-release-announcement.html) for more details.
+Check out the [TVM stack homepage](https://tvm.ai/) for more information.
License
-------
-© Contributors, 2017. Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license.
+© Contributors. Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license.
Contribute to TVM
-----------------
 TVM adopts the Apache committer model; we aim to create an open source project that is maintained and owned by the community.
-
-- [Contributor Guide](docs/how_to/contribute.md)
-- Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md)
-- Please also update [NEWS.md](NEWS.md) on changes and improvements in API and codes.
+Check out the [Contributor Guide](https://docs.tvm.ai/contribute/).
Acknowledgement
---------------
diff --git a/apps/README.md b/apps/README.md
index 254f8c26a510..2345cc3ab548 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -3,9 +3,9 @@ This folder contains various extension projects using TVM,
they also serve as examples on how to use TVM in your own project.
If you are interested in writing optimized kernels with TVM, checkout [TOPI: TVM Operator Inventory](../topi).
-If you are interested in end to end deep learning model compilation, checkout [NNVM Compiler](https://github.com/dmlc/nnvm).
- [extension](extension) How to extend TVM C++ api along with python API.
- [ios_rpc](ios_rpc) iOS RPC server.
- [android_rpc](android_rpc) Android RPC server.
+- [benchmark](benchmark) Example end-to-end compilation benchmarks
- [howto_deploy](howto_deploy) Tutorial on how to deploy TVM with minimum code dependency.
diff --git a/apps/android_deploy/.gitignore b/apps/android_deploy/.gitignore
new file mode 100644
index 000000000000..39fb081a42a8
--- /dev/null
+++ b/apps/android_deploy/.gitignore
@@ -0,0 +1,9 @@
+*.iml
+.gradle
+/local.properties
+/.idea/workspace.xml
+/.idea/libraries
+.DS_Store
+/build
+/captures
+.externalNativeBuild
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
new file mode 100644
index 000000000000..801ca8bdf95c
--- /dev/null
+++ b/apps/android_deploy/README.md
@@ -0,0 +1,119 @@
+# Android TVM Demo
+
+This folder contains an Android demo app that shows how to deploy a model using the TVM runtime API on an Android phone.
+
+You will need the [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
+
+## Build and Installation
+
+### Build APK
+
+We use [Gradle](https://gradle.org) to build. Please follow [the installation instructions](https://gradle.org/install) for your operating system.
+
+Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
+
+```
+dependencies {
+ compile fileTree(dir: 'libs', include: ['*.jar'])
+ androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+ exclude group: 'com.android.support', module: 'support-annotations'
+ })
+ compile 'com.android.support:appcompat-v7:26.0.1'
+ compile 'com.android.support.constraint:constraint-layout:1.0.2'
+ compile 'com.android.support:design:26.0.1'
+ compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+ testCompile 'junit:junit:4.12'
+}
+```
+
+By default the application is built with the CPU flavor of the TVM runtime; follow the instructions below to set it up.
+In `app/src/main/jni/make` you will find the JNI Makefile config `config.mk`. Copy it to `app/src/main/jni` and modify it as needed.
+
+```bash
+cd apps/android_deploy/app/src/main/jni
+cp make/config.mk .
+```
+
+Here is an example `config.mk`:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 0
+```
+
+Now use Gradle to compile the JNI code, resolve Java dependencies, and build the Android application together with tvm4j. Run the following script to generate the APK file.
+
+```bash
+export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
+cd apps/android_deploy
+gradle clean build
+```
+
+In `app/build/outputs/apk` you'll find `app-release-unsigned.apk`. Use `dev_tools/gen_keystore.sh` to generate a signing key and `dev_tools/sign_apk.sh` to get the signed APK file `app/build/outputs/apk/tvmdemo-release.apk`.
+
+Upload `tvmdemo-release.apk` to your Android device and install it.
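+
+If your device is connected over USB with debugging enabled, you can also install it from the command line:
+
+```bash
+# Install (or update, with -r) the signed APK on a connected device.
+adb install -r app/build/outputs/apk/tvmdemo-release.apk
+```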
+
+### Build with OpenCL
+
+The application does not link against the OpenCL library unless you configure it to. Modify the JNI Makefile config in `app/src/main/jni` with the proper target OpenCL configuration.
+
+Here is an example `config.mk` with OpenCL enabled:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 1
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS = libOpenCL.so
+```
+
+Note that you should specify the correct GPU development headers for your Android device. Run `adb shell dumpsys | grep GLES` to find out which GPU your Android device uses. It is very likely that the library (libOpenCL.so) is already present on the mobile device. For instance, I found it under `/system/vendor/lib64`. You can run `adb pull /system/vendor/lib64/libOpenCL.so ./` to copy the file to your desktop.
+
+After you set up `config.mk`, follow the instructions in [Build APK](#buildapk) to build the Android package with the OpenCL flavor.
+
+## Cross Compile and Run on Android Devices
+
+### Architecture and Android Standalone Toolchain
+
+In order to cross-compile a shared library (.so) for your Android device, you have to know the target triple for the device (refer to [Cross-compilation using Clang](https://clang.llvm.org/docs/CrossCompilation.html) for more information). Run `adb shell cat /proc/cpuinfo` to list the device's CPU information.
+
+Now use the NDK to generate a standalone toolchain for your device. For my test device, I used the following command.
+
+```bash
+cd /opt/android-ndk/build/tools/
+./make-standalone-toolchain.sh --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64
+```
+
+If everything goes well, you will find the compile tools in `/opt/android-toolchain-arm64/bin`. For example, `bin/aarch64-linux-android-g++` can be used to compile C++ source code and create shared libraries for arm64 Android devices.
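+
+A minimal sketch of such an invocation, assuming the toolchain path above and a hypothetical object file `lib.o` produced by your build:
+
+```bash
+# Hypothetical example: lib.o stands in for the object file you want to package.
+export TOOLCHAIN=/opt/android-toolchain-arm64
+$TOOLCHAIN/bin/aarch64-linux-android-g++ -shared -fPIC -o deploy_lib.so lib.o
+```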
+
+### Place compiled model on Android application assets folder
+
+Follow the instructions [here](http://docs.tvm.ai/deploy/android.html) to get a compiled model for the Android target.
+
+Copy the compiled model files `deploy_lib.so`, `deploy_graph.json` and `deploy_param.params` to `apps/android_deploy/app/src/main/assets/` and set the TVM flavor in [MainActivity.java](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java#L81):
+
+`CPU version flavor`
+```
+ private static final boolean EXE_GPU = false;
+```
+
+`OpenCL version flavor`
+```
+ private static final boolean EXE_GPU = true;
+```
+
+
+Install the compiled Android application on your phone and enjoy the image classifier demo using the extraction model.
+
+You can define your own TVM operators and deploy them via this demo application on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_deploy/app/.gitignore b/apps/android_deploy/app/.gitignore
new file mode 100644
index 000000000000..796b96d1c402
--- /dev/null
+++ b/apps/android_deploy/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/apps/android_deploy/app/build.gradle b/apps/android_deploy/app/build.gradle
new file mode 100644
index 000000000000..6790308a9ec4
--- /dev/null
+++ b/apps/android_deploy/app/build.gradle
@@ -0,0 +1,56 @@
+// import DownloadModels task
+project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
+project.ext.TMP_DIR = project.buildDir.toString() + '/downloads'
+
+// Download the default models (a compiled version of the darknet framework extraction model);
+// if you wish to use your own models, place them in the "assets" directory
+// and comment out this line.
+apply from: "download-models.gradle"
+
+apply plugin: 'com.android.application'
+
+task buildJni(type: Exec, description: 'Build JNI libs') {
+ commandLine 'sh', 'src/main/jni/build.sh'
+}
+
+tasks.withType(JavaCompile) {
+ compileTask -> compileTask.dependsOn buildJni
+}
+
+android {
+ compileSdkVersion 26
+ buildToolsVersion "26.0.1"
+ defaultConfig {
+ applicationId "ml.dmlc.tvm.android.demo"
+ minSdkVersion 17
+ targetSdkVersion 26
+ versionCode 1
+ versionName "1.0"
+ testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+ }
+ buildTypes {
+ release {
+ minifyEnabled false
+ proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+ }
+ }
+ sourceSets {
+ main {
+ jni.srcDirs = []
+ jniLibs.srcDirs = ['src/main/libs']
+ assets.srcDirs = [project.ext.ASSET_DIR]
+ }
+ }
+}
+
+dependencies {
+ compile fileTree(dir: 'libs', include: ['*.jar'])
+ androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+ exclude group: 'com.android.support', module: 'support-annotations'
+ })
+ compile 'com.android.support:appcompat-v7:26.0.1'
+ compile 'com.android.support.constraint:constraint-layout:1.0.2'
+ compile 'com.android.support:design:26.0.1'
+ compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+ testCompile 'junit:junit:4.12'
+}
diff --git a/apps/android_deploy/app/download-models.gradle b/apps/android_deploy/app/download-models.gradle
new file mode 100644
index 000000000000..5b0509fbca2b
--- /dev/null
+++ b/apps/android_deploy/app/download-models.gradle
@@ -0,0 +1,64 @@
+/*
+ * download-models.gradle
+ * Downloads model files from ${MODEL_URL} into application's asset folder
+ * Input:
+ * project.ext.TMP_DIR: absolute path to hold downloaded zip files
+ * project.ext.ASSET_DIR: absolute path to save unzipped model files
+ * Output:
+ * 3 model files will be downloaded into given folder of ext.ASSET_DIR
+ */
+// hard coded model files
+def models = ['extraction.zip']
+
+// Root URL for model archives
+def MODEL_URL = 'https://github.com/PariksheetPinjari909/TVM_models/blob/master/extraction_model'
+buildscript {
+ repositories {
+ jcenter()
+ }
+ dependencies {
+ classpath 'de.undercouch:gradle-download-task:3.2.0'
+ }
+}
+
+import de.undercouch.gradle.tasks.download.Download
+task downloadFile(type: Download){
+ for (f in models) {
+ src "${MODEL_URL}/" + f + "?raw=true"
+ dest new File(project.ext.TMP_DIR + "/" + f)
+ }
+ overwrite true
+}
+
+task extractModels(type: Copy) {
+ def needDownload = false
+ for (f in models) {
+ def localFile = f.split("/")[-1]
+ if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) {
+ needDownload = true
+ }
+ }
+
+ if (needDownload) {
+ dependsOn downloadFile
+ }
+
+ for (f in models) {
+ def localFile = f.split("/")[-1]
+ from zipTree(project.ext.TMP_DIR + '/' + localFile)
+ }
+
+ into file(project.ext.ASSET_DIR)
+ fileMode 0644
+ exclude '**/LICENSE'
+}
+
+tasks.whenTaskAdded { task ->
+ if (task.name == 'assembleDebug') {
+ task.dependsOn 'extractModels'
+ }
+ if (task.name == 'assembleRelease') {
+ task.dependsOn 'extractModels'
+ }
+}
+
diff --git a/apps/android_deploy/app/src/main/AndroidManifest.xml b/apps/android_deploy/app/src/main/AndroidManifest.xml
new file mode 100644
index 000000000000..bac82ee90faa
--- /dev/null
+++ b/apps/android_deploy/app/src/main/AndroidManifest.xml
@@ -0,0 +1,37 @@
+<!-- AndroidManifest.xml markup omitted -->
diff --git a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
new file mode 100644
index 000000000000..f3cdefe1c2ff
--- /dev/null
+++ b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ml.dmlc.tvm.android.demo;
+
+import android.Manifest;
+import android.content.Intent;
+import android.content.pm.PackageManager;
+import android.content.res.AssetManager;
+import android.app.AlertDialog;
+import android.app.ProgressDialog;
+import android.content.DialogInterface;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
+import android.graphics.Canvas;
+import android.graphics.Matrix;
+import android.net.Uri;
+import android.os.AsyncTask;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Environment;
+import android.os.SystemClock;
+import android.provider.MediaStore;
+import android.support.v4.content.FileProvider;
+import android.support.v7.app.AppCompatActivity;
+import android.support.v7.widget.Toolbar;
+import android.util.Log;
+import android.view.View;
+import android.widget.ImageView;
+import android.widget.TextView;
+import android.widget.Toast;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Vector;
+
+import ml.dmlc.tvm.Function;
+import ml.dmlc.tvm.Module;
+import ml.dmlc.tvm.NDArray;
+import ml.dmlc.tvm.TVMContext;
+import ml.dmlc.tvm.TVMValue;
+import ml.dmlc.tvm.TVMType;
+
+public class MainActivity extends AppCompatActivity {
+ private static final String TAG = MainActivity.class.getSimpleName();
+
+ private static final int PERMISSIONS_REQUEST = 100;
+ private static final int PICTURE_FROM_GALLERY = 101;
+ private static final int PICTURE_FROM_CAMERA = 102;
+ private static final int IMAGE_PREVIEW_WIDTH = 960;
+ private static final int IMAGE_PREVIEW_HEIGHT = 720;
+
+ // TVM constants
+ private static final int OUTPUT_INDEX = 0;
+ private static final int IMG_CHANNEL = 3;
+ private static final String INPUT_NAME = "data";
+
+    // Configuration values for the extraction model. Note that the graph, lib and
+    // params are not included with TVM and must be manually placed in the assets/
+    // directory by the user.
+    // Graphs and models downloaded from https://github.com/pjreddie/darknet/blob/ may
+    // be converted, e.g., via define_and_compile_model.py.
+ private static final boolean EXE_GPU = false;
+ private static final int MODEL_INPUT_SIZE = 224;
+ private static final String MODEL_CL_LIB_FILE = "file:///android_asset/deploy_lib_opencl.so";
+ private static final String MODEL_CPU_LIB_FILE = "file:///android_asset/deploy_lib_cpu.so";
+ private static final String MODEL_GRAPH_FILE = "file:///android_asset/deploy_graph.json";
+ private static final String MODEL_PARAM_FILE = "file:///android_asset/deploy_param.params";
+ private static final String MODEL_LABEL_FILE = "file:///android_asset/imagenet.shortnames.list";
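+    // For reference, assets/ is then expected to contain (per the constants above):
+    //   deploy_lib_cpu.so (and deploy_lib_opencl.so when EXE_GPU is true),
+    //   deploy_graph.json, deploy_param.params and imagenet.shortnames.list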
+
+ private Uri mCameraImageUri;
+ private ImageView mImageView;
+ private TextView mResultView;
+ private AssetManager assetManager;
+ private Module graphRuntimeModule;
+    private Vector<String> labels = new Vector<String>();
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.activity_main);
+ Toolbar toolbar = findViewById(R.id.toolbar);
+ setSupportActionBar(toolbar);
+ assetManager = getAssets();
+
+ mImageView = (ImageView) findViewById(R.id.imageView);
+ mResultView = (TextView) findViewById(R.id.resultTextView);
+ findViewById(R.id.btnPickImage).setOnClickListener(new View.OnClickListener() {
+ @Override
+ public void onClick(View v) {
+ showPictureDialog();
+ }
+ });
+
+ if (hasPermission()) {
+            // instantiate the TVM runtime and set up the environment in the
+            // background once the application starts
+            new LoadModelAsyncTask().execute();
+ } else {
+ requestPermission();
+ }
+ }
+
+    /*
+       Load the precompiled model into the TVM graph runtime and initialize the system.
+    */
+    private class LoadModelAsyncTask extends AsyncTask<Void, Integer, Integer> {
+ ProgressDialog dialog = new ProgressDialog(MainActivity.this);
+
+ @Override
+ protected Integer doInBackground(Void... args) {
+
+ // load synset name
+            String labelFilename = MODEL_LABEL_FILE.split("file:///android_asset/")[1];
+            Log.i(TAG, "Reading synset names from: " + labelFilename);
+            try {
+                String labelsContent = new String(getBytesFromFile(assetManager, labelFilename));
+ for (String line : labelsContent.split("\\r?\\n")) {
+ labels.add(line);
+ }
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading synset name file!" + e);
+ return -1;//failure
+ }
+
+ // load json graph
+ String modelGraph = null;
+ String graphFilename = MODEL_GRAPH_FILE.split("file:///android_asset/")[1];
+ Log.i(TAG, "Reading json graph from: " + graphFilename);
+ try {
+ modelGraph = new String(getBytesFromFile(assetManager, graphFilename));
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading json graph file!" + e);
+ return -1;//failure
+ }
+
+            // copy the TVM compiled library into the application cache folder
+ String libCacheFilePath = null;
+ String libFilename = EXE_GPU ? MODEL_CL_LIB_FILE.split("file:///android_asset/")[1] :
+ MODEL_CPU_LIB_FILE.split("file:///android_asset/")[1];
+ Log.i(TAG, "Uploading compiled function to cache folder");
+ try {
+ libCacheFilePath = getTempLibFilePath(libFilename);
+ byte[] modelLibByte = getBytesFromFile(assetManager, libFilename);
+ FileOutputStream fos = new FileOutputStream(libCacheFilePath);
+ fos.write(modelLibByte);
+ fos.close();
+ } catch (IOException e) {
+ Log.e(TAG, "Problem uploading compiled function!" + e);
+ return -1;//failure
+ }
+
+ // load parameters
+ byte[] modelParams = null;
+ String paramFilename = MODEL_PARAM_FILE.split("file:///android_asset/")[1];
+ try {
+ modelParams = getBytesFromFile(assetManager, paramFilename);
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading params file!" + e);
+ return -1;//failure
+ }
+
+ // create java tvm context
+ TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu();
+
+ // tvm module for compiled functions
+ Module modelLib = Module.load(libCacheFilePath);
+
+ // get global function module for graph runtime
+ Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create");
+ TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph)
+ .pushArg(modelLib)
+ .pushArg(tvmCtx.deviceType)
+ .pushArg(tvmCtx.deviceId)
+ .invoke();
+ graphRuntimeModule = runtimeCreFunRes.asModule();
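+            // this mirrors tvm.contrib.graph_runtime.create(graph, lib, ctx) on the
+            // Python side, expressed here through TVM's packed-function API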
+
+            // get the function from the module (load parameters)
+ Function loadParamFunc = graphRuntimeModule.getFunction("load_params");
+ loadParamFunc.pushArg(modelParams).invoke();
+
+ // release tvm local variables
+ modelLib.release();
+ loadParamFunc.release();
+ runtimeCreFun.release();
+
+ return 0;//success
+ }
+
+ @Override
+ protected void onPreExecute() {
+ dialog.setCancelable(false);
+ dialog.setMessage("Loading Model...");
+ dialog.show();
+ super.onPreExecute();
+ }
+
+ @Override
+ protected void onPostExecute(Integer status) {
+ if (dialog != null && dialog.isShowing()) {
+ dialog.dismiss();
+ }
+ if (status != 0) {
+ showDialog("Error", "Fail to initialized model, check compiled model");
+ }
+ }
+ }
+
+    /*
+       Run prediction on a decoded input bitmap using the TVM graph runtime.
+    */
+    private class ModelRunAsyncTask extends AsyncTask<Bitmap, Integer, Integer> {
+ ProgressDialog dialog = new ProgressDialog(MainActivity.this);
+
+ @Override
+ protected Integer doInBackground(Bitmap... bitmaps) {
+ if (null != graphRuntimeModule) {
+ int count = bitmaps.length;
+ for (int i = 0 ; i < count ; i++) {
+ long processingTimeMs = SystemClock.uptimeMillis();
+ Log.i(TAG, "Decode JPEG image content");
+
+ // extract the jpeg content
+ ByteArrayOutputStream stream = new ByteArrayOutputStream();
+ bitmaps[i].compress(Bitmap.CompressFormat.JPEG,100,stream);
+ byte[] byteArray = stream.toByteArray();
+ Bitmap imageBitmap = BitmapFactory.decodeByteArray(byteArray, 0, byteArray.length);
+
+                    // crop the input image at the centre to the model input size
+                    // production deployment note: resize the image to the model input
+                    // size instead of cropping, so that no image content is lost
+ Bitmap cropImageBitmap = Bitmap.createBitmap(MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, Bitmap.Config.ARGB_8888);
+ Matrix frameToCropTransform = getTransformationMatrix(imageBitmap.getWidth(), imageBitmap.getHeight(),
+ MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, 0, true);
+ Canvas canvas = new Canvas(cropImageBitmap);
+ canvas.drawBitmap(imageBitmap, frameToCropTransform, null);
+
+ // image pixel int values
+ int[] pixelValues = new int[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE];
+ // image RGB float values
+ float[] imgRgbValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL];
+ // image RGB transpose float values
+ float[] imgRgbTranValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL];
+
+ // pre-process the image data from 0-255 int to normalized float based on the
+ // provided parameters.
+ cropImageBitmap.getPixels(pixelValues, 0, MODEL_INPUT_SIZE, 0, 0, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE);
+ for (int j = 0; j < pixelValues.length; ++j) {
+ imgRgbValues[j * 3 + 0] = ((pixelValues[j] >> 16) & 0xFF)/255.0f;
+ imgRgbValues[j * 3 + 1] = ((pixelValues[j] >> 8) & 0xFF)/255.0f;
+ imgRgbValues[j * 3 + 2] = (pixelValues[j] & 0xFF)/255.0f;
+ }
+
+ // pre-process the image rgb data transpose based on the provided parameters.
+ for (int k = 0; k < IMG_CHANNEL; ++k) {
+ for (int l = 0; l < MODEL_INPUT_SIZE; ++l) {
+ for (int m = 0; m < MODEL_INPUT_SIZE; ++m) {
+ int dst_index = m + MODEL_INPUT_SIZE*l + MODEL_INPUT_SIZE*MODEL_INPUT_SIZE*k;
+ int src_index = k + IMG_CHANNEL*m + IMG_CHANNEL*MODEL_INPUT_SIZE*l;
+ imgRgbTranValues[dst_index] = imgRgbValues[src_index];
+ }
+ }
+ }
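+                    // i.e. convert the bitmap's interleaved HWC layout (RGBRGB...) into
+                    // the CHW layout the model expects: dst[c][h][w] = src[h][w][c]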
+
+                    // get the function from the module (set input data)
+                    Log.i(TAG, "set input data");
+                    NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32"));
+ inputNdArray.copyFrom(imgRgbTranValues);
+ Function setInputFunc = graphRuntimeModule.getFunction("set_input");
+ setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke();
+ // release tvm local variables
+ inputNdArray.release();
+ setInputFunc.release();
+
+                    // get the function from the module (run it)
+ Log.i(TAG, "run function on target");
+ Function runFunc = graphRuntimeModule.getFunction("run");
+ runFunc.invoke();
+ // release tvm local variables
+ runFunc.release();
+
+                    // get the function from the module (get output data)
+ Log.i(TAG, "get output data");
+ NDArray outputNdArray = NDArray.empty(new long[]{1000}, new TVMType("float32"));
+ Function getOutputFunc = graphRuntimeModule.getFunction("get_output");
+ getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke();
+ float[] output = outputNdArray.asFloatArray();
+ // release tvm local variables
+ outputNdArray.release();
+ getOutputFunc.release();
+
+ // display the result from extracted output data
+ if (null != output) {
+ int maxPosition = -1;
+ float maxValue = 0;
+ for (int j = 0; j < output.length; ++j) {
+ if (output[j] > maxValue) {
+ maxValue = output[j];
+ maxPosition = j;
+ }
+ }
+ processingTimeMs = SystemClock.uptimeMillis() - processingTimeMs;
+                        String label = "Prediction Result : ";
+                        label += (maxPosition >= 0 && labels.size() > maxPosition)
+                                ? labels.get(maxPosition) : "unknown";
+ label += "\nPrediction Time : " + processingTimeMs + "ms";
+ mResultView.setText(label);
+ }
+ Log.i(TAG, "prediction finished");
+ }
+ return 0;
+ }
+ return -1;
+ }
+
+ @Override
+ protected void onPreExecute() {
+ dialog.setCancelable(false);
+ dialog.setMessage("Prediction running on image...");
+ dialog.show();
+ super.onPreExecute();
+ }
+
+ @Override
+ protected void onPostExecute(Integer status) {
+ if (dialog != null && dialog.isShowing()) {
+ dialog.dismiss();
+ }
+ if (status != 0) {
+ showDialog("Error", "Fail to predict image, GraphRuntime exception");
+ }
+ }
+ }
+
+ @Override
+ protected void onDestroy() {
+ // release tvm local variables
+ if (null != graphRuntimeModule)
+ graphRuntimeModule.release();
+ super.onDestroy();
+ }
+
+ /**
+ * Read file from assets and return byte array.
+ *
+ * @param assets The asset manager to be used to load assets.
+     * @param fileName The path of the file to read.
+     * @return byte[] file content
+     * @throws IOException if the file cannot be fully read.
+ */
+ private byte[] getBytesFromFile(AssetManager assets, String fileName) throws IOException {
+ InputStream is = assets.open(fileName);
+ int length = is.available();
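+        // NOTE: for asset streams, available() reports the full remaining asset size,
+        // so the whole file fits in one buffer; the loop below still guards against
+        // short reads.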
+ byte[] bytes = new byte[length];
+ // Read in the bytes
+ int offset = 0;
+ int numRead = 0;
+ try {
+ while (offset < bytes.length
+ && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
+ offset += numRead;
+ }
+ } finally {
+ is.close();
+ }
+ // Ensure all the bytes have been read in
+ if (offset < bytes.length) {
+ throw new IOException("Could not completely read file " + fileName);
+ }
+ return bytes;
+ }
+
+ /**
+     * Show a dialog to pick an image from the gallery or capture one with the camera.
+ */
+ private void showPictureDialog(){
+ AlertDialog.Builder pictureDialog = new AlertDialog.Builder(this);
+ pictureDialog.setTitle("Select Action");
+ String[] pictureDialogItems = {
+ "Select photo from gallery",
+ "Capture photo from camera" };
+ pictureDialog.setItems(pictureDialogItems,
+ new DialogInterface.OnClickListener() {
+ @Override
+ public void onClick(DialogInterface dialog, int which) {
+ switch (which) {
+ case 0:
+ choosePhotoFromGallery();
+ break;
+ case 1:
+ takePhotoFromCamera();
+ break;
+ }
+ }
+ });
+ pictureDialog.show();
+ }
+
+ /**
+ * Request to pick image from Gallery.
+ */
+ public void choosePhotoFromGallery() {
+ Intent galleryIntent = new Intent(Intent.ACTION_PICK,
+ android.provider.MediaStore.Images.Media.EXTERNAL_CONTENT_URI);
+
+ startActivityForResult(galleryIntent, PICTURE_FROM_GALLERY);
+ }
+
+ /**
+ * Request to capture image from Camera.
+ */
+ private void takePhotoFromCamera() {
+ Intent intent = new Intent(android.provider.MediaStore.ACTION_IMAGE_CAPTURE);
+
+ if (Build.VERSION.SDK_INT < Build.VERSION_CODES.N) {
+ mCameraImageUri = Uri.fromFile(createImageFile());
+ } else {
+ File file = new File(createImageFile().getPath());
+ mCameraImageUri = FileProvider.getUriForFile(getApplicationContext(), getApplicationContext().getPackageName() + ".provider", file);
+ }
+
+ intent.putExtra(MediaStore.EXTRA_OUTPUT, mCameraImageUri);
+ startActivityForResult(intent, PICTURE_FROM_CAMERA);
+ }
+
+ @Override
+ public void onActivityResult(int requestCode, int resultCode, Intent data) {
+ super.onActivityResult(requestCode, resultCode, data);
+ if (resultCode == this.RESULT_CANCELED) {
+ return;
+ }
+ Uri contentURI = null;
+ if (requestCode == PICTURE_FROM_GALLERY) {
+ if (data != null) {
+ contentURI = data.getData();
+ }
+ } else if (requestCode == PICTURE_FROM_CAMERA) {
+ contentURI = mCameraImageUri;
+ }
+ if (null != contentURI) {
+ try {
+ Bitmap bitmap = MediaStore.Images.Media.getBitmap(this.getContentResolver(), contentURI);
+ Bitmap scaled = Bitmap.createScaledBitmap(bitmap, IMAGE_PREVIEW_HEIGHT, IMAGE_PREVIEW_WIDTH, true);
+ mImageView.setImageBitmap(scaled);
+ new ModelRunAsyncTask().execute(scaled);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /**
+     * Get an application cache path in which to place the compiled library.
+     *
+     * @param fileName library file name.
+     * @return String application cache folder path
+     * @throws IOException if the cache directory cannot be created.
+ */
+ private final String getTempLibFilePath(String fileName) throws IOException {
+ File tempDir = File.createTempFile("tvm4j_demo_", "");
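+        // createTempFile is used only to reserve a unique path; the empty file is
+        // replaced below by a directory of the same name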
+ if (!tempDir.delete() || !tempDir.mkdir()) {
+ throw new IOException("Couldn't create directory " + tempDir.getAbsolutePath());
+ }
+ return (tempDir + File.separator + fileName);
+ }
+
+ /**
+     * Create an image file under external storage where the camera application saves the captured image.
+ *
+ * @return File image file under sdcard where camera can save image
+ */
+ private File createImageFile() {
+ // Create an image file name
+ String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());
+ String imageFileName = "JPEG_" + timeStamp + "_";
+ File storageDir = Environment.getExternalStoragePublicDirectory(
+ Environment.DIRECTORY_PICTURES);
+ try {
+ File image = File.createTempFile(
+ imageFileName, // prefix
+ ".jpg", // suffix
+ storageDir // directory
+ );
+ return image;
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ /**
+ * Show dialog to user.
+ *
+ * @param title dialog display title
+ * @param msg dialog display message
+ */
+ private void showDialog(String title, String msg) {
+ AlertDialog.Builder builder = new AlertDialog.Builder(this);
+ builder.setTitle(title);
+ builder.setMessage(msg);
+ builder.setCancelable(true);
+ builder.setNeutralButton(android.R.string.ok,
+ new DialogInterface.OnClickListener() {
+ public void onClick(DialogInterface dialog, int id) {
+ dialog.cancel();
+ finish();
+ }
+ });
+ builder.create().show();
+ }
+
+ @Override
+    public void onRequestPermissionsResult(final int requestCode, final String[] permissions, final int[] grantResults) {
+        if (requestCode == PERMISSIONS_REQUEST) {
+            if (grantResults.length > 1
+                    && grantResults[0] == PackageManager.PERMISSION_GRANTED
+                    && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+                // instantiate the TVM runtime and set up the environment in the
+                // background once the application starts
+                new LoadModelAsyncTask().execute();
+ } else {
+ requestPermission();
+ }
+ }
+ }
+
+ /**
+ * Whether application has required mandatory permissions to run.
+ */
+ private boolean hasPermission() {
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+ return checkSelfPermission(Manifest.permission.CAMERA) == PackageManager.PERMISSION_GRANTED &&
+ checkSelfPermission(Manifest.permission.WRITE_EXTERNAL_STORAGE) == PackageManager.PERMISSION_GRANTED;
+ } else {
+ return true;
+ }
+ }
+
+ /**
+ * Request required mandatory permission for application to run.
+ */
+ private void requestPermission() {
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+ if (shouldShowRequestPermissionRationale(Manifest.permission.CAMERA) ||
+ shouldShowRequestPermissionRationale(Manifest.permission.WRITE_EXTERNAL_STORAGE)) {
+ Toast.makeText(this,
+ "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+ }
+ requestPermissions(new String[] {Manifest.permission.CAMERA, Manifest.permission.WRITE_EXTERNAL_STORAGE}, PERMISSIONS_REQUEST);
+ }
+ }
+
+ /**
+ * Returns a transformation matrix from one reference frame into another.
+ * Handles cropping (if maintaining aspect ratio is desired) and rotation.
+ *
+ * @param srcWidth Width of source frame.
+ * @param srcHeight Height of source frame.
+ * @param dstWidth Width of destination frame.
+ * @param dstHeight Height of destination frame.
+ * @param applyRotation Amount of rotation to apply from one frame to another.
+ * Must be a multiple of 90.
+ * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
+ * cropping the image if necessary.
+ * @return The transformation fulfilling the desired requirements.
+ */
+ public static Matrix getTransformationMatrix(
+ final int srcWidth,
+ final int srcHeight,
+ final int dstWidth,
+ final int dstHeight,
+ final int applyRotation,
+ final boolean maintainAspectRatio) {
+ final Matrix matrix = new Matrix();
+
+ if (applyRotation != 0) {
+ if (applyRotation % 90 != 0) {
+ Log.w(TAG, "Rotation of %d % 90 != 0 " + applyRotation);
+ }
+
+ // Translate so center of image is at origin.
+ matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
+
+ // Rotate around origin.
+ matrix.postRotate(applyRotation);
+ }
+
+ // Account for the already applied rotation, if any, and then determine how
+ // much scaling is needed for each axis.
+ final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
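+        // a rotation of 90 or 270 degrees swaps the source width and height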
+
+ final int inWidth = transpose ? srcHeight : srcWidth;
+ final int inHeight = transpose ? srcWidth : srcHeight;
+
+ // Apply scaling if necessary.
+ if (inWidth != dstWidth || inHeight != dstHeight) {
+ final float scaleFactorX = dstWidth / (float) inWidth;
+ final float scaleFactorY = dstHeight / (float) inHeight;
+
+ if (maintainAspectRatio) {
+                // Scale by the larger factor so that dst is filled completely while
+                // maintaining the aspect ratio; some of the image may fall off the edge.
+ final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
+ matrix.postScale(scaleFactor, scaleFactor);
+ } else {
+ // Scale exactly to fill dst from src.
+ matrix.postScale(scaleFactorX, scaleFactorY);
+ }
+ }
+
+ if (applyRotation != 0) {
+ // Translate back from origin centered reference to destination frame.
+ matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
+ }
+
+ return matrix;
+ }
+}
\ No newline at end of file
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
new file mode 100644
index 000000000000..a99517f90332
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -0,0 +1,42 @@
+LOCAL_PATH := $(call my-dir)
+MY_PATH := $(LOCAL_PATH)
+
+include $(CLEAR_VARS)
+
+LOCAL_PATH := $(MY_PATH)
+ROOT_PATH := $(MY_PATH)/../../../../../..
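+# (six directory levels up from app/src/main/jni to the TVM repository root)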
+
+ifndef config
+ ifneq ("$(wildcard ./config.mk)","")
+ config ?= config.mk
+ else
+ config ?= make/config.mk
+ endif
+endif
+
+include $(config)
+
+LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
+LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
+
+LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
+ $(ROOT_PATH)/dlpack/include \
+ $(ROOT_PATH)/dmlc-core/include \
+ $(ROOT_PATH)/HalideIR/src \
+ $(ROOT_PATH)/topi/include
+
+LOCAL_MODULE = tvm4j_runtime_packed
+
+LOCAL_CPP_FEATURES += exceptions
+LOCAL_LDLIBS += -latomic
+LOCAL_ARM_MODE := arm
+
+ifdef ADD_C_INCLUDES
+ LOCAL_C_INCLUDES += $(ADD_C_INCLUDES)
+endif
+
+ifdef ADD_LDLIBS
+ LOCAL_LDLIBS += $(ADD_LDLIBS)
+endif
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk
new file mode 100644
index 000000000000..8e81a8d6a81c
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Application.mk
@@ -0,0 +1,16 @@
+ifndef config
+ ifneq ("$(wildcard ./config.mk)","")
+ config ?= config.mk
+ else
+ config ?= make/config.mk
+ endif
+endif
+
+include $(config)
+
+APP_STL := c++_static
+
+APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
+ifeq ($(USE_OPENCL), 1)
+ APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+endif
diff --git a/apps/android_deploy/app/src/main/jni/build.sh b/apps/android_deploy/app/src/main/jni/build.sh
new file mode 100644
index 000000000000..1ca38ae5bd12
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+PATH="$PATH:/usr/local/bin"
+CURR_DIR=$(cd `dirname $0`; pwd)
+ROOT_DIR="$CURR_DIR/../../../../../.."
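+# generate the JNI header for ml.dmlc.tvm.LibInfo from the prebuilt tvm4j classes,
+# then copy the JNI sources from the jvm/ tree so ndk-build can compile them here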
+javah -o $CURR_DIR/ml_dmlc_tvm_native_c_api.h -cp "$ROOT_DIR/jvm/core/target/*" ml.dmlc.tvm.LibInfo || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc $CURR_DIR/ || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/jni_helper_func.h $CURR_DIR/ || exit -1
+rm -rf $CURR_DIR/../libs
+ndk-build --directory=$CURR_DIR
diff --git a/apps/android_deploy/app/src/main/jni/make/config.mk b/apps/android_deploy/app/src/main/jni/make/config.mk
new file mode 100644
index 000000000000..8d6f5a56dd5b
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/make/config.mk
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------------------------
+# Template configuration for compiling
+#
+# If you want to change the configuration, please use the following
+# steps. Assume you are in the root directory. First copy this
+# file so that any local changes will be ignored by git
+#
+# cp make/config.mk .
+#
+# Next, modify the relevant entries, and then compile by
+#
+# ./build.sh
+#
+#-------------------------------------------------------------------------------
+APP_ABI = all
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 0
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES =
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS =
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
new file mode 100644
index 000000000000..0b5f4ee67237
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -0,0 +1,27 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file tvm_runtime.h
+ * \brief Pack all tvm runtime source files
+ */
+#include <sys/stat.h>
+#include <fstream>
+
+#include "../src/runtime/c_runtime_api.cc"
+#include "../src/runtime/cpu_device_api.cc"
+#include "../src/runtime/workspace_pool.cc"
+#include "../src/runtime/module_util.cc"
+#include "../src/runtime/system_lib_module.cc"
+#include "../src/runtime/module.cc"
+#include "../src/runtime/registry.cc"
+#include "../src/runtime/file_util.cc"
+#include "../src/runtime/dso_module.cc"
+#include "../src/runtime/thread_pool.cc"
+#include "../src/runtime/threading_backend.cc"
+#include "../src/runtime/ndarray.cc"
+
+#include "../src/runtime/graph/graph_runtime.cc"
+
+#ifdef TVM_OPENCL_RUNTIME
+#include "../src/runtime/opencl/opencl_device_api.cc"
+#include "../src/runtime/opencl/opencl_module.cc"
+#endif
diff --git a/apps/android_deploy/app/src/main/res/layout/activity_main.xml b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
new file mode 100644
index 000000000000..b16a5c2548a6
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
@@ -0,0 +1,27 @@
+<!-- (layout markup stripped from this excerpt: declares the app toolbar,
+     @id/toolbar, and includes the content_main layout used by MainActivity) -->
diff --git a/apps/android_deploy/app/src/main/res/layout/content_main.xml b/apps/android_deploy/app/src/main/res/layout/content_main.xml
new file mode 100644
index 000000000000..34de93843645
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/content_main.xml
@@ -0,0 +1,46 @@
+<!-- (layout markup stripped from this excerpt: the main screen widgets referenced
+     by MainActivity, namely the image preview @id/imageView, the result text
+     @id/resultTextView, and the pick-image button @id/btnPickImage) -->
diff --git a/apps/android_deploy/app/src/main/res/values/colors.xml b/apps/android_deploy/app/src/main/res/values/colors.xml
new file mode 100644
index 000000000000..3bdabdf11d00
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/colors.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <!-- color names reconstructed; the tag markup was stripped from this excerpt -->
+    <color name="colorPrimary">#3F51B5</color>
+    <color name="colorPrimaryDark">#303F9F</color>
+    <color name="colorAccent">#06d467</color>
+</resources>
diff --git a/apps/android_deploy/app/src/main/res/values/strings.xml b/apps/android_deploy/app/src/main/res/values/strings.xml
new file mode 100644
index 000000000000..cf1fa24069a1
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">TVM Android Demo</string>
+</resources>
\ No newline at end of file
diff --git a/apps/android_deploy/app/src/main/res/values/styles.xml b/apps/android_deploy/app/src/main/res/values/styles.xml
new file mode 100644
index 000000000000..44f664f202f9
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/styles.xml
@@ -0,0 +1,17 @@
+<!-- (style markup stripped from this excerpt: defines the AppTheme styles applied
+     by the manifest and layouts) -->
diff --git a/apps/android_deploy/app/src/main/res/xml/provider_paths.xml b/apps/android_deploy/app/src/main/res/xml/provider_paths.xml
new file mode 100644
index 000000000000..74a5cde1d8fd
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/xml/provider_paths.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<paths xmlns:android="http://schemas.android.com/apk/res/android">
+    <!-- reconstructed standard FileProvider path declaration; the tag markup was stripped from this excerpt -->
+    <external-path name="external_files" path="." />
+</paths>
\ No newline at end of file
diff --git a/apps/android_deploy/build.gradle b/apps/android_deploy/build.gradle
new file mode 100644
index 000000000000..f7bbe2641c9d
--- /dev/null
+++ b/apps/android_deploy/build.gradle
@@ -0,0 +1,29 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+ repositories {
+ jcenter()
+ }
+ dependencies {
+ classpath 'com.android.tools.build:gradle:2.3.3'
+ classpath 'org.apache.httpcomponents:httpclient:4.5.4'
+
+ // NOTE: Do not place your application dependencies here; they belong
+ // in the individual module build.gradle files
+ }
+}
+
+allprojects {
+ repositories {
+ jcenter()
+ maven {
+ url 'https://maven.google.com'
+ }
+ mavenLocal()
+ mavenCentral()
+ }
+}
+
+task clean(type: Delete) {
+ delete rootProject.buildDir
+}
diff --git a/apps/android_deploy/dev_tools/gen_keystore.sh b/apps/android_deploy/dev_tools/gen_keystore.sh
new file mode 100644
index 000000000000..e91cd05ad957
--- /dev/null
+++ b/apps/android_deploy/dev_tools/gen_keystore.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+CURR_DIR=$(cd `dirname $0`; pwd)
+keytool -genkey -keystore $CURR_DIR/tvmdemo.keystore -alias tvmdemo -keyalg RSA -validity 10000
diff --git a/apps/android_deploy/dev_tools/sign_apk.sh b/apps/android_deploy/dev_tools/sign_apk.sh
new file mode 100644
index 000000000000..314f82cdb76c
--- /dev/null
+++ b/apps/android_deploy/dev_tools/sign_apk.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+CURR_DIR=$(cd `dirname $0`; pwd)
+APK_DIR=$CURR_DIR/../app/build/outputs/apk
+UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
+SIGNED_APK=$APK_DIR/tvmdemo-release.apk
+jarsigner -verbose -keystore $CURR_DIR/tvmdemo.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmdemo'
+echo $SIGNED_APK
diff --git a/apps/android_deploy/settings.gradle b/apps/android_deploy/settings.gradle
new file mode 100644
index 000000000000..e7b4def49cb5
--- /dev/null
+++ b/apps/android_deploy/settings.gradle
@@ -0,0 +1 @@
+include ':app'
diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 062227b3e424..64e7779f150a 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -1,6 +1,6 @@
# Android TVM RPC
-This folder contains Android RPC app that allows us to launch an rpc server on a Android device and connect to it through python script and do testing on the python side as normal TVM RPC.
+This folder contains the Android RPC app that allows us to launch an RPC server on an Android device, connect to it through a Python script, and run tests from the Python side as with normal TVM RPC.
You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
@@ -8,6 +8,8 @@ You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Andro
### Build APK
+We use [Gradle](https://gradle.org) to build. Please follow [the installation instructions](https://gradle.org/install) for your operating system.
+
Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
```
@@ -24,17 +26,17 @@ dependencies {
}
```
-The Gradle build script is provided in the app root folder. It downloads the proper version of Gradle, compiles JNI, resolves Java dependencies and builds the Android application together with tvm4j. Run following script to build apk file.
+Now use Gradle to compile the JNI code, resolve Java dependencies, and build the Android application together with tvm4j. Run the following commands to generate the APK file.
```bash
export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
cd apps/android_rpc
-./gradlew clean build
+gradle clean build
```
In `app/build/outputs/apk` you'll find `app-release-unsigned.apk`, use `dev_tools/gen_keystore.sh` to generate a signature and use `dev_tools/sign_apk.sh` to get the signed apk file `app/build/outputs/apk/tvmrpc-release.apk`.
-Now upload `tvmrpc-release.apk` to your Android device and install it.
+Upload `tvmrpc-release.apk` to your Android device and install it.
### Build with OpenCL
@@ -49,15 +51,15 @@ Here's a piece of example for `config.mk`.
```makefile
APP_ABI = arm64-v8a
-
+
APP_PLATFORM = android-17
-
+
# whether enable OpenCL during compile
USE_OPENCL = 1
-
+
# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
-
+
# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
ADD_LDLIBS = libOpenCL.so
```
@@ -83,19 +85,22 @@ If everything goes well, you will find compile tools in `/opt/android-toolchain-
### Cross Compile and Upload to the Android Device
-First start a proxy server using `python -m tvm.exec.rpc_proxy` and make your Android device connect to this proxy server via TVM RPC application.
+First start an RPC tracker using `python -m tvm.exec.rpc_tracker --port [PORT]` and connect your Android device to this RPC tracker via the TVM RPC application.
+Set the `Address` and `Port` fields to the address and port of the RPC tracker respectively.
+The key should be set to "android" if you wish to avoid modifying the default test script.
Then checkout [android\_rpc/tests/android\_rpc\_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py) and run,
```bash
-# Specify the proxy host
-export TVM_ANDROID_RPC_PROXY_HOST=0.0.0.0
+# Specify the RPC tracker
+export TVM_TRACKER_HOST=0.0.0.0
+export TVM_TRACKER_PORT=[PORT]
# Specify the standalone Android C++ compiler
export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
python android_rpc_test.py
```
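+
+As a sanity check, you can also request a session from the tracker by hand before
+running the test script. A minimal sketch (the host, port and key must match the
+values configured above; 9190 is just a placeholder for [PORT]):
+
+```python
+from tvm import rpc
+
+tracker = rpc.connect_tracker("0.0.0.0", 9190)   # TVM_TRACKER_HOST / TVM_TRACKER_PORT
+remote = tracker.request("android", priority=0, session_timeout=60)
+print(remote.cpu(0))  # a TVMContext handle on the Android device
+```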
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector additon on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives the following results.
```bash
TVM: Initializing cython mode...
diff --git a/apps/android_rpc/app/build.gradle b/apps/android_rpc/app/build.gradle
index 97364da5cd87..a91455fc5477 100644
--- a/apps/android_rpc/app/build.gradle
+++ b/apps/android_rpc/app/build.gradle
@@ -13,7 +13,7 @@ android {
buildToolsVersion "26.0.1"
defaultConfig {
applicationId "ml.dmlc.tvm.tvmrpc"
- minSdkVersion 17
+ minSdkVersion 24
targetSdkVersion 26
versionCode 1
versionName "1.0"
diff --git a/apps/android_rpc/app/src/main/AndroidManifest.xml b/apps/android_rpc/app/src/main/AndroidManifest.xml
index 6b0d6d995dba..2dbc06ece6e3 100644
--- a/apps/android_rpc/app/src/main/AndroidManifest.xml
+++ b/apps/android_rpc/app/src/main/AndroidManifest.xml
@@ -2,11 +2,14 @@
+        android:theme="@style/AppTheme"
+        android:icon="@mipmap/ic_launcher" >
+        <!-- (remaining manifest markup stripped from this excerpt; the change
+             registers the new RPCActivity alongside the existing MainActivity) -->
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
index 62c877e8b34c..d80008bbe258 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
@@ -19,34 +19,30 @@
import android.annotation.SuppressLint;
import android.app.AlertDialog;
+import android.content.Context;
import android.content.DialogInterface;
+import android.content.SharedPreferences;
import android.os.Bundle;
import android.os.Handler;
import android.os.Message;
+
import android.support.v7.app.AppCompatActivity;
import android.support.v7.widget.Toolbar;
import android.widget.CompoundButton;
import android.widget.EditText;
import android.widget.Switch;
+import android.widget.Button;
+import android.view.View;
+import android.content.Intent;
+import android.app.NotificationChannel;
+import android.app.NotificationManager;
+
public class MainActivity extends AppCompatActivity {
- static final int MSG_RPC_ERROR = 0;
- static final String MSG_RPC_ERROR_DATA_KEY = "msg_rpc_error_data_key";
-
- private RPCProcessor tvmServerWorker;
- @SuppressLint("HandlerLeak")
- private final Handler rpcHandler = new Handler() {
- @Override
- public void dispatchMessage(Message msg) {
- Switch switchConnect = findViewById(R.id.switch_connect);
- if (msg.what == MSG_RPC_ERROR && switchConnect.isChecked()) {
- // switch off and show alert dialog.
- switchConnect.setChecked(false);
- String msgBody = msg.getData().getString(MSG_RPC_ERROR_DATA_KEY);
- showDialog("Error", msgBody);
- }
- }
- };
+ private boolean skipRelaunch = true;
+ // wait time before automatic restart of RPC Activity
+ public static final int HANDLER_RESTART_DELAY = 5000;
+
private void showDialog(String title, String msg) {
AlertDialog.Builder builder = new AlertDialog.Builder(this);
@@ -62,61 +58,124 @@ public void onClick(DialogInterface dialog, int id) {
builder.create().show();
}
+ public Intent updateRPCPrefs() {
+ System.err.println("updating preferences...");
+ EditText edProxyAddress = findViewById(R.id.input_address);
+ EditText edProxyPort = findViewById(R.id.input_port);
+ EditText edAppKey = findViewById(R.id.input_key);
+ Switch inputSwitch = findViewById(R.id.switch_persistent);
+
+ final String proxyHost = edProxyAddress.getText().toString();
+ final int proxyPort = Integer.parseInt(edProxyPort.getText().toString());
+ final String key = edAppKey.getText().toString();
+ final boolean isChecked = inputSwitch.isChecked();
+
+ SharedPreferences pref = getApplicationContext().getSharedPreferences("RPCProxyPreference", Context.MODE_PRIVATE);
+ SharedPreferences.Editor editor = pref.edit();
+ editor.putString("input_address", proxyHost);
+ editor.putString("input_port", edProxyPort.getText().toString());
+ editor.putString("input_key", key);
+ editor.putBoolean("input_switch", isChecked);
+ editor.commit();
+
+ Intent intent = new Intent(this, RPCActivity.class);
+ intent.putExtra("host", proxyHost);
+ intent.putExtra("port", proxyPort);
+ intent.putExtra("key", key);
+ return intent;
+ }
+
+ private void setupRelaunch() {
+ final Context context = this;
+ final Switch switchPersistent = findViewById(R.id.switch_persistent);
+ final Runnable rPCStarter = new Runnable() {
+ public void run() {
+ if (switchPersistent.isChecked()) {
+ System.err.println("relaunching RPC activity in 5s...");
+ Intent intent = ((MainActivity) context).updateRPCPrefs();
+ startActivity(intent);
+ }
+ }
+ };
+ Handler handler = new Handler();
+ handler.postDelayed(rPCStarter, HANDLER_RESTART_DELAY);
+ }
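+
+    // Together with onResume() below, this implements the "Keep RPC Alive" switch:
+    // whenever RPCActivity exits and MainActivity resumes, the RPC screen is
+    // relaunched after HANDLER_RESTART_DELAY ms while the switch stays checked.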
+
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
Toolbar toolbar = findViewById(R.id.toolbar);
setSupportActionBar(toolbar);
+ final Context context = this;
- tvmServerWorker = new RPCProcessor(rpcHandler);
- tvmServerWorker.setDaemon(true);
- tvmServerWorker.start();
-
- Switch switchConnect = findViewById(R.id.switch_connect);
- switchConnect.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+ Switch switchPersistent = findViewById(R.id.switch_persistent);
+ switchPersistent.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
@Override
public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
if (isChecked) {
- enableInputView(false);
- connectProxy();
+ System.err.println("automatic RPC restart enabled...");
+ updateRPCPrefs();
} else {
- disconnect();
- enableInputView(true);
+ System.err.println("automatic RPC restart disabled...");
+ updateRPCPrefs();
}
}
});
- }
- @Override
- protected void onDestroy() {
- super.onDestroy();
- tvmServerWorker.disconnect();
- }
-
- private void connectProxy() {
- EditText edProxyAddress = findViewById(R.id.input_address);
- EditText edProxyPort = findViewById(R.id.input_port);
- EditText edAppKey = findViewById(R.id.input_key);
+ Button startRPC = findViewById(R.id.button_start_rpc);
+ startRPC.setOnClickListener(new View.OnClickListener() {
+ public void onClick(View v) {
+ Intent intent = ((MainActivity) context).updateRPCPrefs();
+ startActivity(intent);
+ }
+ });
- final String proxyHost = edProxyAddress.getText().toString();
- final int proxyPort = Integer.parseInt(edProxyPort.getText().toString());
- final String key = edAppKey.getText().toString();
+ enableInputView(true);
+ }
- tvmServerWorker.connect(proxyHost, proxyPort, key);
+ @Override
+ protected void onResume() {
+ System.err.println("MainActivity onResume...");
+ System.err.println("skipRelaunch: " + skipRelaunch);
+ // if this is the first time onResume is called, do nothing, otherwise we
+ // may double launch
+ if (!skipRelaunch) {
+ enableInputView(true);
+ setupRelaunch();
+ } else {
+ skipRelaunch = false;
+ }
+ super.onResume();
}
- private void disconnect() {
- tvmServerWorker.disconnect();
- System.err.println("Disconnected.");
+ @Override
+ protected void onDestroy() {
+ super.onDestroy();
}
private void enableInputView(boolean enable) {
EditText edProxyAddress = findViewById(R.id.input_address);
EditText edProxyPort = findViewById(R.id.input_port);
EditText edAppKey = findViewById(R.id.input_key);
+ Switch input_switch = findViewById(R.id.switch_persistent);
edProxyAddress.setEnabled(enable);
edProxyPort.setEnabled(enable);
edAppKey.setEnabled(enable);
+
+ if (enable) {
+ SharedPreferences pref = getApplicationContext().getSharedPreferences("RPCProxyPreference", Context.MODE_PRIVATE);
+ String inputAddress = pref.getString("input_address", null);
+ if (null != inputAddress)
+ edProxyAddress.setText(inputAddress);
+ String inputPort = pref.getString("input_port", null);
+ if (null != inputPort)
+ edProxyPort.setText(inputPort);
+ String inputKey = pref.getString("input_key", null);
+ if (null != inputKey)
+ edAppKey.setText(inputKey);
+ boolean isChecked = pref.getBoolean("input_switch", false);
+ input_switch.setChecked(isChecked);
+ }
}
}
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java
new file mode 100644
index 000000000000..912a7c9e69a6
--- /dev/null
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ml.dmlc.tvm.tvmrpc;
+
+import android.os.Bundle;
+import android.support.v7.app.AppCompatActivity;
+import android.content.Intent;
+import android.widget.Button;
+import android.view.View;
+
+public class RPCActivity extends AppCompatActivity {
+ private RPCProcessor tvmServerWorker;
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.activity_rpc);
+
+ Button stopRPC = findViewById(R.id.button_stop_rpc);
+ stopRPC.setOnClickListener(new View.OnClickListener() {
+ public void onClick(View v) {
+                System.err.println("tvmServerWorker is null: " + (tvmServerWorker == null));
+ if (tvmServerWorker != null) {
+ // currently will raise a socket closed exception
+ tvmServerWorker.disconnect();
+ }
+ finish();
+ // prevent Android from recycling the process
+ System.exit(0);
+ }
+ });
+
+ System.err.println("rpc activity onCreate...");
+ Intent intent = getIntent();
+ String host = intent.getStringExtra("host");
+ int port = intent.getIntExtra("port", 9090);
+ String key = intent.getStringExtra("key");
+
+ tvmServerWorker = new RPCProcessor();
+ tvmServerWorker.setDaemon(true);
+ tvmServerWorker.start();
+ tvmServerWorker.connect(host, port, key);
+ }
+
+ @Override
+ protected void onDestroy() {
+ System.err.println("rpc activity onDestroy");
+ tvmServerWorker.disconnect();
+ super.onDestroy();
+ }
+}
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
index 2ff7fee8a6b3..6da890931104 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
@@ -17,15 +17,11 @@
package ml.dmlc.tvm.tvmrpc;
-import android.os.Bundle;
-import android.os.Handler;
-import android.os.Message;
import android.os.ParcelFileDescriptor;
-
import java.net.Socket;
-
-import ml.dmlc.tvm.rpc.ConnectProxyServerProcessor;
+import ml.dmlc.tvm.rpc.ConnectTrackerServerProcessor;
import ml.dmlc.tvm.rpc.SocketFileDescriptorGetter;
+import ml.dmlc.tvm.rpc.RPCWatchdog;
/**
* Connect to RPC proxy and deal with requests.
@@ -34,10 +30,10 @@ class RPCProcessor extends Thread {
private String host;
private int port;
private String key;
-
private boolean running = false;
- private ConnectProxyServerProcessor currProcessor;
- private final Handler uiHandler;
+ private long startTime;
+ private ConnectTrackerServerProcessor currProcessor;
+ private boolean first = true;
static final SocketFileDescriptorGetter socketFdGetter
= new SocketFileDescriptorGetter() {
@@ -47,11 +43,9 @@ public int get(Socket socket) {
}
};
- RPCProcessor(Handler uiHandler) {
- this.uiHandler = uiHandler;
- }
-
@Override public void run() {
+ RPCWatchdog watchdog = new RPCWatchdog();
+ watchdog.start();
while (true) {
synchronized (this) {
currProcessor = null;
@@ -61,20 +55,17 @@ public int get(Socket socket) {
} catch (InterruptedException e) {
}
}
- currProcessor = new ConnectProxyServerProcessor(host, port, key, socketFdGetter);
+ try {
+ currProcessor = new ConnectTrackerServerProcessor(host, port, key, socketFdGetter, watchdog);
+ } catch (Throwable e) {
+ e.printStackTrace();
+ // kill if creating a new processor failed
+ System.exit(0);
+ }
}
- try {
+ if (currProcessor != null)
currProcessor.run();
- } catch (Throwable e) {
- disconnect();
- // turn connect switch off.
- Message message = new Message();
- message.what = MainActivity.MSG_RPC_ERROR;
- Bundle bundle = new Bundle();
- bundle.putString(MainActivity.MSG_RPC_ERROR_DATA_KEY, e.getMessage());
- message.setData(bundle);
- uiHandler.sendMessage(message);
- }
+ watchdog.finishTimeout();
}
}
@@ -101,6 +92,6 @@ synchronized void connect(String host, int port, String key) {
this.port = port;
this.key = key;
running = true;
- notify();
+ this.notify();
}
}
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 01cad9b783a7..5bf52bdaffc0 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -8,9 +8,18 @@ endif
include $(config)
-APP_STL := gnustl_static
+# We target every architecture except armeabi here, for two reasons:
+# 1) armeabi is deprecated in NDK r16 and removed in r17
+# 2) vulkan is not supported in armeabi
+APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips
+APP_STL := c++_static
APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
-ifeq ($(USE_OPENCL), 1)
+ifeq ($(USE_OPENCL), 1)
APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
endif
+
+ifeq ($(USE_VULKAN), 1)
+ APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+ APP_LDFLAGS += -lvulkan
+endif
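+
+# USE_VULKAN, like USE_OPENCL above, is read from make/config.mk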
diff --git a/apps/android_rpc/app/src/main/jni/make/config.mk b/apps/android_rpc/app/src/main/jni/make/config.mk
index 8d6f5a56dd5b..c40ce4ba3ec7 100644
--- a/apps/android_rpc/app/src/main/jni/make/config.mk
+++ b/apps/android_rpc/app/src/main/jni/make/config.mk
@@ -14,11 +14,19 @@
#-------------------------------------------------------------------------------
APP_ABI = all
-APP_PLATFORM = android-17
+APP_PLATFORM = android-24
# whether enable OpenCL during compile
USE_OPENCL = 0
+# whether to enable Vulkan during compile
+USE_VULKAN = 0
+
+ifeq ($(USE_VULKAN), 1)
+ # Statically linking vulkan requires API Level 24 or higher
+ APP_PLATFORM = android-24
+endif
+
# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
ADD_C_INCLUDES =
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index fc384a8fcd72..c3c33b0fde37 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -6,6 +6,18 @@
#include <sys/stat.h>
#include <fstream>
+/* Enable custom logging - this will cause TVM to pass every log message
+ * through CustomLogMessage instead of LogMessage. By enabling this, we must
+ * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log
+ * messages to Android logcat.
+ */
+#define DMLC_LOG_CUSTOMIZE 1
+
+/* Ensure that fatal errors are passed to the logger before throwing
+ * in LogMessageFatal
+ */
+#define DMLC_LOG_BEFORE_THROW 1
+
#include "../src/runtime/c_runtime_api.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/workspace_pool.cc"
@@ -21,10 +33,25 @@
#include "../src/runtime/rpc/rpc_module.cc"
#include "../src/runtime/rpc/rpc_socket_impl.cc"
#include "../src/runtime/thread_pool.cc"
-
+#include "../src/runtime/threading_backend.cc"
#include "../src/runtime/graph/graph_runtime.cc"
+#include "../src/runtime/ndarray.cc"
#ifdef TVM_OPENCL_RUNTIME
#include "../src/runtime/opencl/opencl_device_api.cc"
#include "../src/runtime/opencl/opencl_module.cc"
#endif
+
+#ifdef TVM_VULKAN_RUNTIME
+#include "../src/runtime/vulkan/vulkan_device_api.cc"
+#include "../src/runtime/vulkan/vulkan_module.cc"
+#endif
+
+
+#include <android/log.h>
+
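+// With this hook in place, runtime log messages (e.g. LOG(INFO) << "...") show up
+// in logcat under the TVM_RUNTIME tag: adb logcat -s TVM_RUNTIME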
+void dmlc::CustomLogMessage::Log(const std::string& msg) {
+ // This is called for every message logged by TVM.
+ // We pass the message to logcat.
+ __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str());
+}
diff --git a/apps/android_rpc/app/src/main/res/layout/activity_main.xml b/apps/android_rpc/app/src/main/res/layout/activity_main.xml
index f617cf2a04bb..53d48bbd60d9 100644
--- a/apps/android_rpc/app/src/main/res/layout/activity_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/activity_main.xml
@@ -24,4 +24,3 @@
-
diff --git a/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml b/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml
new file mode 100644
index 000000000000..ba3102a6033c
--- /dev/null
+++ b/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml
@@ -0,0 +1,26 @@
+<!-- (layout markup stripped from this excerpt: the RPC activity screen, including
+     the toolbar and the Stop RPC button @id/button_stop_rpc, plus the included
+     content_rpc layout) -->
diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 827cdfb01b8a..0f2564833ecd 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -64,9 +64,9 @@
+ android:text="@string/label_persistent"/>
+    <!-- (additional markup stripped from this excerpt; the change adds the
+         Start RPC button, @id/button_start_rpc) -->
diff --git a/apps/android_rpc/app/src/main/res/layout/content_rpc.xml b/apps/android_rpc/app/src/main/res/layout/content_rpc.xml
new file mode 100644
index 000000000000..fb9ab2f66a9b
--- /dev/null
+++ b/apps/android_rpc/app/src/main/res/layout/content_rpc.xml
@@ -0,0 +1,14 @@
+<!-- (layout markup stripped from this excerpt: the RPC screen content referenced
+     by activity_rpc.xml) -->
diff --git a/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png b/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png
new file mode 100644
index 000000000000..32a4f0f9157f
Binary files /dev/null and b/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png differ
diff --git a/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png b/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png
new file mode 100644
index 000000000000..8e5d4dd8331e
Binary files /dev/null and b/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png differ
diff --git a/apps/android_rpc/app/src/main/res/values/strings.xml b/apps/android_rpc/app/src/main/res/values/strings.xml
index 468fbed8ceaa..33caa374b496 100644
--- a/apps/android_rpc/app/src/main/res/values/strings.xml
+++ b/apps/android_rpc/app/src/main/res/values/strings.xml
@@ -1,15 +1,19 @@
    <string name="app_name">TVM RPC</string>
+   <!-- string names below are reconstructed; the tag markup was stripped from this excerpt -->
+   <string name="rpc_activity">RPC</string>
-   <string name="input_address_hint">Enter the proxy server address</string>
-   <string name="input_port_hint">Enter the proxy server port</string>
+   <string name="input_address_hint">Enter the tracker server address</string>
+   <string name="input_port_hint">Enter the tracker server port</string>
    <string name="input_key_hint">Enter the app connection key</string>
    <string name="label_address">Address</string>
    <string name="label_port">Port</string>
    <string name="label_key">Key</string>
-   <string name="switch_connect">Connect to Proxy</string>
+   <string name="label_persistent">Keep RPC Alive</string>
-   <string name="switch_on">Connected</string>
-   <string name="switch_off">Disconnected</string>
+   <string name="switch_on">Enabled</string>
+   <string name="switch_off">Disabled</string>
+
+   <string name="button_start_rpc">Start RPC</string>
+   <string name="button_stop_rpc">Stop RPC</string>
diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar b/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar
deleted file mode 100644
index 13372aef5e24..000000000000
Binary files a/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar and /dev/null differ
diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties b/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties
deleted file mode 100644
index 80a1f0954c16..000000000000
--- a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties
+++ /dev/null
@@ -1,6 +0,0 @@
-#Mon Aug 14 21:31:55 CST 2017
-distributionBase=GRADLE_USER_HOME
-distributionPath=wrapper/dists
-zipStoreBase=GRADLE_USER_HOME
-zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
diff --git a/apps/android_rpc/gradlew b/apps/android_rpc/gradlew
deleted file mode 100755
index 9d82f7891513..000000000000
--- a/apps/android_rpc/gradlew
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env bash
-
-##############################################################################
-##
-## Gradle start up script for UN*X
-##
-##############################################################################
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
-
-APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
-
-# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
-
-warn ( ) {
- echo "$*"
-}
-
-die ( ) {
- echo
- echo "$*"
- echo
- exit 1
-}
-
-# OS specific support (must be 'true' or 'false').
-cygwin=false
-msys=false
-darwin=false
-case "`uname`" in
- CYGWIN* )
- cygwin=true
- ;;
- Darwin* )
- darwin=true
- ;;
- MINGW* )
- msys=true
- ;;
-esac
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
- ls=`ls -ld "$PRG"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '/.*' > /dev/null; then
- PRG="$link"
- else
- PRG=`dirname "$PRG"`"/$link"
- fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
-
-CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
-
-# Determine the Java command to use to start the JVM.
-if [ -n "$JAVA_HOME" ] ; then
- if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
- # IBM's JDK on AIX uses strange locations for the executables
- JAVACMD="$JAVA_HOME/jre/sh/java"
- else
- JAVACMD="$JAVA_HOME/bin/java"
- fi
- if [ ! -x "$JAVACMD" ] ; then
- die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
- fi
-else
- JAVACMD="java"
- which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-fi
-
-# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
- MAX_FD_LIMIT=`ulimit -H -n`
- if [ $? -eq 0 ] ; then
- if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
- MAX_FD="$MAX_FD_LIMIT"
- fi
- ulimit -n $MAX_FD
- if [ $? -ne 0 ] ; then
- warn "Could not set maximum file descriptor limit: $MAX_FD"
- fi
- else
- warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
- fi
-fi
-
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
- GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
-
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
- JAVACMD=`cygpath --unix "$JAVACMD"`
-
- # We build the pattern for arguments to be converted via cygpath
- ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
- SEP=""
- for dir in $ROOTDIRSRAW ; do
- ROOTDIRS="$ROOTDIRS$SEP$dir"
- SEP="|"
- done
- OURCYGPATTERN="(^($ROOTDIRS))"
- # Add a user-defined pattern to the cygpath arguments
- if [ "$GRADLE_CYGPATTERN" != "" ] ; then
- OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
- fi
- # Now convert the arguments - kludge to limit ourselves to /bin/sh
- i=0
- for arg in "$@" ; do
- CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
- CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
-
- if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
- else
- eval `echo args$i`="\"$arg\""
- fi
- i=$((i+1))
- done
- case $i in
- (0) set -- ;;
- (1) set -- "$args0" ;;
- (2) set -- "$args0" "$args1" ;;
- (3) set -- "$args0" "$args1" "$args2" ;;
- (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
- esac
-fi
-
-# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
-function splitJvmOpts() {
- JVM_OPTS=("$@")
-}
-eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
-JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
-
-exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/apps/android_rpc/gradlew.bat b/apps/android_rpc/gradlew.bat
deleted file mode 100644
index aec99730b4e8..000000000000
--- a/apps/android_rpc/gradlew.bat
+++ /dev/null
@@ -1,90 +0,0 @@
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto init
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:init
-@rem Get command-line arguments, handling Windowz variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-if "%@eval[2+2]" == "4" goto 4NT_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-goto execute
-
-:4NT_args
-@rem Get arguments from the 4NT Shell from JP Software
-set CMD_LINE_ARGS=%$
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index 650892ab5735..cfb04c1ca9a9 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -1,17 +1,19 @@
"""Testcode for Android RPC.
-To use it, start a rpc proxy with "python -m tvm.exec.rpc_proxy".
-And configure the proxy host field as commented.
+To use it, start an RPC tracker with "python -m tvm.exec.rpc_tracker".
+Use the tracker's address and port when configuring the RPC app.
+Use "android" as the key if you wish to avoid modifying this script.
"""
import tvm
import os
-from tvm.contrib import rpc, util, ndk, rpc_proxy
+from tvm import rpc
+from tvm.contrib import util, ndk
import numpy as np
# Set to be address of tvm proxy.
-proxy_host = os.environ["TVM_ANDROID_RPC_PROXY_HOST"]
-proxy_port = 9090
+tracker_host = os.environ["TVM_TRACKER_HOST"]
+tracker_port = int(os.environ["TVM_TRACKER_PORT"])
key = "android"
# Change target configuration.
@@ -32,7 +34,7 @@ def test_rpc_module():
# Build the dynamic lib.
# If we don't want to do metal and only use cpu, just set target to be target
f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
- path_dso1 = temp.relpath("dev_lib.so")
+ path_dso1 = temp.relpath("dev_lib2.so")
f.export_library(path_dso1, ndk.create_shared)
s = tvm.create_schedule(B.op)
@@ -44,29 +46,31 @@ def test_rpc_module():
path_dso2 = temp.relpath("cpu_lib.so")
f.export_library(path_dso2, ndk.create_shared)
- # connect to the proxy
- remote = rpc.connect(proxy_host, proxy_port, key=key)
+ tracker = rpc.connect_tracker(tracker_host, tracker_port)
+ remote = tracker.request(key, priority=0,
+ session_timeout=60)
- print('Run GPU test ...')
- ctx = remote.cl(0)
- remote.upload(path_dso1)
- f1 = remote.load_module("dev_lib.so")
+ print('Run CPU test ...')
+ ctx = remote.cpu(0)
+ remote.upload(path_dso2)
+ f2 = remote.load_module("cpu_lib.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
- time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+ time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
- print('Run CPU test ...')
- ctx = remote.cpu(0)
- remote.upload(path_dso2)
- f2 = remote.load_module("cpu_lib.so")
+
+ print('Run GPU test ...')
+ ctx = remote.cl(0)
+ remote.upload(path_dso1)
+ f1 = remote.load_module("dev_lib2.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
- time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
+ time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
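For reference, the tracker workflow that the updated test relies on looks like this in isolation. This is a minimal sketch: the host, port, and key are assumed values, and `text_summary()` is the tracker-session helper that backs `python -m tvm.exec.query_rpc_tracker`.

```python
from tvm import rpc

# Connect to a running tracker (started with "python -m tvm.exec.rpc_tracker").
tracker = rpc.connect_tracker('127.0.0.1', 9190)  # assumed host/port
# Optionally inspect which device keys are registered.
print(tracker.text_summary())
# Request a session on a device registered under the "android" key.
remote = tracker.request('android', priority=0, session_timeout=60)
print(remote.cpu(0))  # a context on the remote device
```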
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
new file mode 100644
index 000000000000..e83e47c46eb7
--- /dev/null
+++ b/apps/benchmark/README.md
@@ -0,0 +1,70 @@
+# Performance Benchmark
+
+## Results
+
+See results on wiki page https://github.com/dmlc/tvm/wiki/Benchmark
+
+## How to Reproduce
+
+### ARM CPU
+We use TVM's RPC infrastructure to make device management easy, so you need to set it up to reproduce the benchmark results.
+
+1. Start an RPC Tracker on the host machine
+```bash
+python3 -m tvm.exec.rpc_tracker
+```
+
+2. Register devices to the tracker
+* For Linux device
+ * Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
+ * Register your device to tracker by
+ ```bash
+ python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
+ ```
+ Replace `[HOST_IP]` with the IP address of the host machine and `[DEVICE_KEY]` with the name of the device.
+
+ E.g., here is an example command for RK3399:
+ `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker host.
+
+* For Android device
+ * Build and install tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
+ Make sure you can pass the Android RPC test; once it passes, you already know how to register.
+
+3. Verify the device registration
+ We can query all registered devices by
+ ```bash
+ python3 -m tvm.exec.query_rpc_tracker
+ ```
+ You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.
+
+ For our test environment, one sample output can be
+ ```bash
+ Queue Status
+ ------------------------------
+ key free pending
+ ------------------------------
+ mate10pro 1 0
+ p20pro 2 0
+ pixel2 2 0
+ rk3399 2 0
+ rasp3b 8 0
+ ```
+
+4. Run benchmark
+ We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
+ and released the pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
+ During compilation, TVM will download these operator parameters automatically.
+
+ ```bash
+ python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
+ python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
+ python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
+ python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
+ python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro
+ ```
+
+ If your device has the same SoC as one of the devices above, you can reuse these parameters
+ (e.g. use `llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu` as the target; see the sketch right after this README).
+ Otherwise, you need to tune for your own device; please follow this
+ [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
+
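Since the pre-tuned parameters above are looked up by target string, reusing them on a board with the same SoC only requires constructing the matching target. A minimal sketch, assuming the rk3399 example from step 4 and that `tvm.target.create` is available for parsing explicit target strings:

```python
import tvm

# Helper used by arm_cpu_imagenet_bench.py; it expands to the
# -device/-model/-target attributes shown above.
target = tvm.target.arm_cpu(model='rk3399')

# Equivalent explicit target string for a board with the same SoC.
target = tvm.target.create('llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu')
```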
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
new file mode 100644
index 000000000000..7baf244e0dae
--- /dev/null
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -0,0 +1,96 @@
+"""Benchmark script for performance on ARM CPU.
+See README.md for the usage and results of this script.
+"""
+
+import argparse
+import time
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+def get_network(name, batch_size):
+ """Get the symbol definition and random weight of a network"""
+ input_shape = (batch_size, 3, 224, 224)
+ output_shape = (batch_size, 1000)
+
+ if name == 'resnet-18':
+ net, params = nnvm.testing.resnet.get_workload(num_layers=18,
+ batch_size=batch_size, image_shape=(3, 224, 224))
+ elif name == 'mobilenet':
+ net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+ elif name == 'squeezenet v1.1':
+ net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
+ version='1.1')
+ elif name == 'vgg-16':
+ net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
+ else:
+ raise RuntimeError("Unsupported network: " + name)
+
+ return net, params, input_shape, output_shape
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
+ parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
+ 'pixel2', 'rasp3b', 'pynq'])
+ parser.add_argument("--host", type=str, default='localhost')
+ parser.add_argument("--port", type=int, default=9190)
+ parser.add_argument("--rpc-key", type=str, required=True)
+ parser.add_argument("--number", type=int, default=6)
+ args = parser.parse_args()
+
+ dtype = 'float32'
+
+ if args.network is None:
+ networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+ else:
+ networks = [args.network]
+
+ target = tvm.target.arm_cpu(model=args.device)
+
+ # connect to remote device
+ tracker = tvm.rpc.connect_tracker(args.host, args.port)
+ remote = tracker.request(args.rpc_key)
+
+ print("--------------------------------------------------")
+ print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+ print("--------------------------------------------------")
+ for network in networks:
+ net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+ with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+ graph, lib, params = nnvm.compiler.build(
+ net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+ tmp = tempdir()
+ if 'android' in str(target):
+ from tvm.contrib import ndk
+ filename = "%s.so" % network
+ lib.export_library(tmp.relpath(filename), ndk.create_shared)
+ else:
+ filename = "%s.tar" % network
+ lib.export_library(tmp.relpath(filename))
+
+ # upload library and params
+ ctx = remote.context(str(target), 0)
+ remote.upload(tmp.relpath(filename))
+ rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+
+ rlib = remote.load_module(filename)
+ module = runtime.create(graph, rlib, ctx)
+ data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+ module.set_input('data', data_tvm)
+ module.set_input(**rparams)
+
+ # evaluate
+ ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+ prof_res = np.array(ftimer().results) * 1000 # multiply by 1000 to convert to milliseconds
+ print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
new file mode 100644
index 000000000000..fca4e35b6516
--- /dev/null
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -0,0 +1,80 @@
+""" Benchmark script for performance on GPUs.
+
+For example, run the file with:
+`python gpu_imagenet_bench.py --model=mobilenet --target=cuda`.
+For more details about how to set up the inference environment on GPUs,
+please refer to the NNVM tutorial "ImageNet Inference on the GPU".
+"""
+import time
+import argparse
+import numpy as np
+import tvm
+import nnvm.compiler
+import nnvm.testing
+from tvm.contrib import util, nvcc
+from tvm.contrib import graph_runtime as runtime
+
+@tvm.register_func
+def tvm_callback_cuda_compile(code):
+ ptx = nvcc.compile_cuda(code, target="ptx")
+ return ptx
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ choices=['resnet', 'mobilenet'],
+ help="The model type.")
+ parser.add_argument('--target', type=str, required=True,
+ choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
+ help="Compilation target.")
+ parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
+ parser.add_argument('--num-iter', type=int, default=1000, help="Number of iterations per benchmark run.")
+ parser.add_argument('--repeat', type=int, default=1, help="Number of times to repeat the measurement.")
+ args = parser.parse_args()
+ opt_level = args.opt_level
+ num_iter = args.num_iter
+ ctx = tvm.context(args.target, 0)
+ batch_size = 1
+ num_classes = 1000
+ image_shape = (3, 224, 224)
+
+ data_shape = (batch_size,) + image_shape
+ out_shape = (batch_size, num_classes)
+ if args.model == 'resnet':
+ net, params = nnvm.testing.resnet.get_workload(
+ batch_size=1, image_shape=image_shape)
+ elif args.model == 'mobilenet':
+ net, params = nnvm.testing.mobilenet.get_workload(
+ batch_size=1, image_shape=image_shape)
+ else:
+ raise ValueError('no benchmark prepared for {}.'.format(args.model))
+
+ if args.target == "cuda":
+ unroll = 1400
+ else:
+ unroll = 128
+ with nnvm.compiler.build_config(opt_level=opt_level):
+ with tvm.build_config(auto_unroll_max_step=unroll,
+ unroll_explicit=(args.target != "cuda")):
+ graph, lib, params = nnvm.compiler.build(
+ net, args.target, shape={"data": data_shape}, params=params)
+
+ data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+ module = runtime.create(graph, lib, ctx)
+ module.set_input(**params)
+ module.set_input("data", data)
+ module.run()
+ out = module.get_output(0, tvm.nd.empty(out_shape))
+ out.asnumpy()
+
+ print('benchmark args: {}'.format(args))
+ ftimer = module.module.time_evaluator("run", ctx, num_iter)
+ for i in range(args.repeat):
+ prof_res = ftimer()
+ print(prof_res)
+ # sleep to avoid overheating the device
+ if i + 1 != args.repeat:
+ time.sleep(45)
+
+if __name__ == '__main__':
+ main()
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index 6d7f4bdf7533..bb8b4b694187 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -22,12 +22,11 @@ struct extension_class_info {
} // namespace tvm
} // namespace runtime
-
-namespace tvm_ext {
-
using namespace tvm;
using namespace tvm::runtime;
+namespace tvm_ext {
+
TVM_REGISTER_EXT_TYPE(IntVector);
TVM_REGISTER_GLOBAL("tvm_ext.ivec_create")
@@ -66,3 +65,18 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
*rv = (*tvm::runtime::Registry::Get("device_api.cpu"))();
});
} // namespace tvm_ext
+
+// This callback approach lets the extension declare functions that TVM can extract.
+// This can be helpful when we want to use a header-only,
+// minimum version of the TVM runtime.
+extern "C" int TVMExtDeclare(TVMFunctionHandle pregister) {
+ const PackedFunc& fregister =
+ *static_cast<PackedFunc*>(pregister);
+ auto mul = [](TVMArgs args, TVMRetValue *rv) {
+ int x = args[0];
+ int y = args[1];
+ *rv = x * y;
+ };
+ fregister("mul", PackedFunc(mul));
+ return 0;
+}
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index 0bbfff14eeef..628602f0baea 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -44,8 +44,14 @@ def ivec_cb(v2):
tvm.convert(ivec_cb)(ivec)
+def test_extract_ext():
+ fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare)
+ assert fdict["mul"](3, 4) == 12
+
+
if __name__ == "__main__":
test_ext_dev()
test_ext_vec()
test_bind_add()
test_sym_add()
+ test_extract_ext()
diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
index 8e59084c60ae..ad4e56680d21 100644
--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
@@ -8,7 +8,7 @@ PKG_CFLAGS = -std=c++11 -O2 -fPIC\
-I${DMLC_CORE}/include\
-I${TVM_ROOT}/dlpack/include\
-PKG_LDFLAGS = -L${TVM_ROOT}/lib -ldl -lpthread
+PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread
.PHONY: clean all
diff --git a/apps/howto_deploy/README.md b/apps/howto_deploy/README.md
index 6c732879a6a5..fda6251ae9c5 100644
--- a/apps/howto_deploy/README.md
+++ b/apps/howto_deploy/README.md
@@ -8,4 +8,4 @@ Type the following command to run the sample code under the current folder(need
./run_example.sh
```
-Checkout [How to Deploy TVM Modules](http://docs.tvmlang.org/how_to/deploy.html) for more information.
+Checkout [How to Deploy TVM Modules](http://docs.tvm.ai/deploy/cpp_deploy.html) for more information.
diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc
index e3a88550dc2b..1fd22e5f2b5f 100644
--- a/apps/howto_deploy/cpp_deploy.cc
+++ b/apps/howto_deploy/cpp_deploy.cc
@@ -28,10 +28,10 @@ void Verify(tvm::runtime::Module mod, std::string fname) {
DLTensor* x;
DLTensor* y;
int ndim = 1;
- int dtype_code = kFloat;
+ int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
- int device_type = kCPU;
+ int device_type = kDLCPU;
int device_id = 0;
int64_t shape[1] = {10};
TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes,
diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
index e5c65b66b71a..27f95e9e6065 100644
--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -25,7 +25,9 @@
#include "../../src/runtime/module.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/thread_pool.cc"
+#include "../../src/runtime/ndarray.cc"
// NOTE: all the files after this are optional modules
// that you can include remove, depending on how much feature you use.
@@ -44,10 +46,19 @@
// #include "../../src/runtime/rpc/rpc_event_impl.cc"
// #include "../../src/runtime/rpc/rpc_server_env.cc"
+// These macros enable the corresponding device APIs when the includes below are uncommented.
+#define TVM_CUDA_RUNTIME 1
+#define TVM_METAL_RUNTIME 1
+#define TVM_OPENCL_RUNTIME 1
+
// Uncomment the following lines to enable Metal
// #include "../../src/runtime/metal/metal_device_api.mm"
// #include "../../src/runtime/metal/metal_module.mm"
+// Uncomment the following lines to enable CUDA
+// #include "../../src/runtime/cuda/cuda_device_api.cc"
+// #include "../../src/runtime/cuda/cuda_module.cc"
+
// Uncomment the following lines to enable OpenCL
// #include "../../src/runtime/opencl/opencl_device_api.cc"
// #include "../../src/runtime/opencl/opencl_module.cc"
diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py
index a3df1d3a9043..c0cfcde6e294 100644
--- a/apps/ios_rpc/tests/ios_rpc_test.py
+++ b/apps/ios_rpc/tests/ios_rpc_test.py
@@ -6,7 +6,8 @@
import tvm
import os
-from tvm.contrib import rpc, util, xcode
+from tvm import rpc
+from tvm.contrib import util, xcode
import numpy as np
# Set to be address of tvm proxy.
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h
index a758f9454460..fec351e7b22b 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.h
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h
@@ -31,9 +31,12 @@ using FEventHandler = std::function<int(const std::string& in_bytes, int flag)>
 std::unique_ptr<RPCChannel> ch(new NSStreamChannel(outputStream));
- std::shared_ptr<RPCSession> sess = RPCSession::Create(std::move(ch), name);
+ std::shared_ptr<RPCSession> sess = RPCSession::Create(std::move(ch), name, remote_key);
return [sess](const std::string& in_bytes, int flag) {
return sess->ServerEventHandler(in_bytes, flag);
};
@@ -101,13 +104,13 @@ void LaunchSyncServer() {
->ServerLoop();
}
-TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.workpath")
+TVM_REGISTER_GLOBAL("tvm.rpc.server.workpath")
.set_body([](TVMArgs args, TVMRetValue* rv) {
static RPCEnv env;
*rv = env.GetPath(args[0]);
});
-TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.load_module")
+TVM_REGISTER_GLOBAL("tvm.rpc.server.load_module")
.set_body([](TVMArgs args, TVMRetValue *rv) {
std::string name = args[0];
std::string fmt = GetFileFormat(name, "");
diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm
index f25501809b01..98527bd67b50 100644
--- a/apps/ios_rpc/tvmrpc/ViewController.mm
+++ b/apps/ios_rpc/tvmrpc/ViewController.mm
@@ -143,7 +143,7 @@ - (void)open {
[outputStream_ scheduleInRunLoop:[NSRunLoop currentRunLoop] forMode:NSDefaultRunLoopMode];
[outputStream_ open];
[inputStream_ open];
- handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_);
+ handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_, "%toinit");
CHECK(handler_ != nullptr);
self.infoText.text = @"";
self.statusLabel.text = @"Connecting...";
@@ -169,7 +169,6 @@ - (IBAction)connect:(id)sender {
}
- (IBAction)disconnect:(id)sender {
-
[self close];
}
diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/pynq_rpc/start_rpc_server.sh
new file mode 100755
index 000000000000..30b3c9a90d6b
--- /dev/null
+++ b/apps/pynq_rpc/start_rpc_server.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
+
+export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
+python -m vta.exec.rpc_server
diff --git a/apps/rocm_rpc/Makefile b/apps/rocm_rpc/Makefile
new file mode 100644
index 000000000000..b4e527980941
--- /dev/null
+++ b/apps/rocm_rpc/Makefile
@@ -0,0 +1,23 @@
+# Makefile Example to deploy TVM modules.
+ROCM_PATH=/opt/rocm
+
+TVM_ROOT=$(shell cd ../..; pwd)
+NNVM_PATH=nnvm
+DMLC_CORE=${TVM_ROOT}/dmlc-core
+
+PKG_CFLAGS = -std=c++11 -O2 -fPIC\
+ -I${TVM_ROOT}/include\
+ -I${DMLC_CORE}/include\
+ -I${TVM_ROOT}/dlpack/include\
+ -I${ROCM_PATH}/include
+
+PKG_LDFLAGS = -L${ROCM_PATH}/lib -L${TVM_ROOT}/lib -ldl -lpthread -lhip_hcc -lMIOpen
+
+.PHONY: clean all
+
+all: lib/libtvm_runtime_rocm.so
+
+# Build rule for all in one TVM package library
+lib/libtvm_runtime_rocm.so: rocm_runtime_pack.cc
+ @mkdir -p $(@D)
+ $(CXX) $(PKG_CFLAGS) -shared -o $@ $(filter %.cc %.o %.a, $^) $(PKG_LDFLAGS)
diff --git a/apps/rocm_rpc/README.md b/apps/rocm_rpc/README.md
new file mode 100644
index 000000000000..70ce9780a31d
--- /dev/null
+++ b/apps/rocm_rpc/README.md
@@ -0,0 +1,41 @@
+# TVM ROCm RPC
+
+This folder contains a simple recipe to make RPC work together with ROCm. TVM's RPC server relies on a process
+fork to create a new process for each incoming session.
+Like the CUDA and OpenCL drivers, the ROCm runtime is not fork-safe.
+A typical CUDA or OpenCL driver initializes lazily,
+so we can use the normal TVM RPC server because we won't touch the driver API before we fork a new session.
+However, the current ROCm runtime initializes eagerly during startup and will directly cause an error during fork.
+This folder provides a workaround for the problem.
+
+## Usage
+- Build tvm **without** rocm (it is important to exclude rocm from the runtime)
+- Modify ROCM_PATH in the current [Makefile](Makefile) to point to your ROCm installation
+- Type make to build lib/libtvm_runtime_rocm.so, which is a standalone dll module
+- Use [start_rpc_server.sh](start_rpc_server.sh) to start the RPC server
+
+## How it works
+- The RPC server starts without ROCm dependency.
+- lib/libtvm_runtime_rocm.so is dynamically loaded only after the fork.
+
+## Note
+With ROCm RPC, we can build an AMDGPU program on a machine without an AMD GPU
+and remotely upload and execute it on an AMDGPU machine.
+Please note that you will need to set the gfx version correctly (via ```-model``` or ```-mcpu```),
+because we can no longer query the GPU version dynamically at runtime.
+
+
+```python
+import tvm
+from tvm.contrib import rpc
+
+# set mcpu explicitly to be the gpu version.
+target = "rocm -mcpu=gfx900"
+remote = rpc.connect(server_host, server_port)
+mod = tvm.build(s, args, target)
+mod.export_library("mylib.so")
+
+remote.upload("mylib.so")
+foo = remote.load_module("mylib.so")
+# same as normal RPC
+```
diff --git a/apps/rocm_rpc/rocm_runtime_pack.cc b/apps/rocm_rpc/rocm_runtime_pack.cc
new file mode 100644
index 000000000000..174d9f0a8270
--- /dev/null
+++ b/apps/rocm_rpc/rocm_runtime_pack.cc
@@ -0,0 +1,15 @@
+/*!
+ * \brief This is an all-in-one file for the ROCM runtime library.
+ *
+ * This is used to create an RPC module library that can be
+ * safely loaded after the RPC server forks a session.
+ */
+
+#define TVM_ROCM_RUNTIME 1
+#define TVM_USE_MIOPEN 1
+#define __HIP_PLATFORM_HCC__ 1
+
+#include "../../src/runtime/rocm/rocm_device_api.cc"
+#include "../../src/runtime/rocm/rocm_module.cc"
+#include "../../src/contrib/miopen/conv_forward.cc"
+#include "../../src/contrib/miopen/miopen_utils.cc"
diff --git a/apps/rocm_rpc/start_rpc_server.sh b/apps/rocm_rpc/start_rpc_server.sh
new file mode 100755
index 000000000000..e082d9d63ee6
--- /dev/null
+++ b/apps/rocm_rpc/start_rpc_server.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+PROJ_ROOT=$(realpath $(dirname "$0")/../..)
+export PYTHONPATH=${PROJ_ROOT}/python:${PYTHONPATH}
+
+python -m tvm.exec.rpc_server "$@" --load-library=${PROJ_ROOT}/apps/rocm_rpc/lib/libtvm_runtime_rocm.so
diff --git a/apps/sgx/.gitignore b/apps/sgx/.gitignore
new file mode 100644
index 000000000000..c3af857904eb
--- /dev/null
+++ b/apps/sgx/.gitignore
@@ -0,0 +1 @@
+lib/
diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
new file mode 100644
index 000000000000..cd7034d4c41b
--- /dev/null
+++ b/apps/sgx/Makefile
@@ -0,0 +1,88 @@
+# Makefile for example to deploy TVM modules in SGX.
+
+TVM_ROOT := $(shell cd ../..; pwd)
+NNVM_PATH := nnvm
+DMLC_CORE := ${TVM_ROOT}/dmlc-core
+
+SGX_SDK ?= /opt/sgxsdk
+SGX_MODE ?= SIM
+SGX_ARCH ?= x64
+SGX_DEBUG ?= 1
+
+sgx_edger8r := $(SGX_SDK)/bin/x64/sgx_edger8r
+sgx_enclave_signer := $(SGX_SDK)/bin/x64/sgx_sign
+
+ifneq ($(SGX_MODE), HW)
+ sgx_sim := _sim
+endif
+urts_library_name := sgx_urts$(sgx_sim)
+trts_library_name := sgx_trts$(sgx_sim)
+tservice_library_name := sgx_tservice$(sgx_sim)
+uservice_library_name := sgx_uae_service$(sgx_sim)
+
+pkg_cflags := -std=c++11 -O2 -fPIC\
+ -I${TVM_ROOT}/include\
+ -I${DMLC_CORE}/include\
+ -I${TVM_ROOT}/dlpack/include\
+ -I.\
+ -DDMLC_LOG_STACK_TRACE=0\
+ -fmax-errors=4
+
+pkg_ldflags := -L${TVM_ROOT}/lib
+
+enclave_include_paths := -I$(SGX_SDK)/include\
+ -I$(SGX_SDK)/include/tlibc\
+ -I$(SGX_SDK)/include/libcxx\
+ -I$(SGX_SDK)/include/stdc++\
+
+enclave_cflags := -static -nostdinc\
+ -fvisibility=hidden -fpie -fstack-protector-strong\
+ -ffunction-sections -fdata-sections\
+ -DDMLC_CXX11_THREAD_LOCAL=0\
+ -include "lib/tvm_t.h"\
+ $(enclave_include_paths)\
+
+enclave_cxxflags := -nostdinc++ $(enclave_cflags) -DTVM_SGX_MAX_CONCURRENCY=4
+
+enclave_ldflags :=\
+ -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\
+ -Wl,--whole-archive -l$(trts_library_name) -Wl,--no-whole-archive\
+ -Wl,--start-group\
+ -lsgx_tstdc -lsgx_tstdcxx -lsgx_tcxx -lsgx_tcrypto -lsgx_tkey_exchange -l$(tservice_library_name)\
+ -Wl,--end-group\
+ -Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined\
+ -Wl,-pie,-eenclave_entry -Wl,--export-dynamic\
+ -Wl,--defsym,__ImageBase=0 -Wl,--gc-sections
+
+.PHONY: clean all
+
+all: lib/test_addone.signed.so
+
+# The code library built by TVM
+lib/test_addone_sys.o: prepare_test_libs.py
+ python prepare_test_libs.py
+
+lib/tvm_t.h: ../../src/runtime/sgx/tvm.edl
+ $(sgx_edger8r) --trusted $< --trusted-dir lib --search-path $(SGX_SDK)/include
+ mv $@ $@.in
+ awk 'NR==4{print "#include <tvm/runtime/c_runtime_api.h>"}1' $@.in > $@
+
+lib/tvm_t.c: lib/tvm_t.h
+
+lib/tvm_t.o: lib/tvm_t.c
+ $(CC) $(enclave_cflags) $(pkg_cflags) -c $< -o $@ -include $(TVM_ROOT)/include/tvm/runtime/c_runtime_api.h
+
+# The enclave library
+lib/test_addone.so: $(TVM_ROOT)/src/runtime/sgx/trusted/runtime.cc lib/tvm_t.o lib/test_addone_sys.o
+ $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) $(enclave_ldflags) -g
+
+# The demo enclave signing key
+lib/enclave.pem:
+ curl -Lso $@ https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem
+
+# The signed enclave
+lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml lib/enclave.pem
+ $(sgx_enclave_signer) sign -key lib/enclave.pem -enclave $< -out $@ -config enclave_config.xml
+
+clean:
+ rm -rf lib
diff --git a/apps/sgx/README.md b/apps/sgx/README.md
new file mode 100644
index 000000000000..565519d457ce
--- /dev/null
+++ b/apps/sgx/README.md
@@ -0,0 +1,34 @@
+# TVM in Intel SGX Example
+
+This application demonstrates the use of a simple TVM model in the [Intel SGX](https://software.intel.com/en-us/blogs/2013/09/26/protecting-application-secrets-with-intel-sgx) trusted computing environment.
+
+## Prerequisites
+
+1. A GNU/Linux environment
+2. TVM compiled with LLVM and SGX, and the `tvm` Python module
+3. The [Linux SGX SDK](https://github.com/intel/linux-sgx) ([pre-built libraries](https://01.org/intel-software-guard-extensions/downloads))
+
+## Running the example
+
+`SGX_SDK=/path/to/sgxsdk bash run_example.sh`
+
+If everything goes well, you should see a lot of build messages and below them
+the text `It works!`.
+
+## High-level overview
+
+First of all, it helps to think of an SGX enclave as a library that can be called
+to perform trusted computation.
+In this library, one can use other libraries like TVM.
+
+Building this example performs the following steps:
+
+1. Creates a simple TVM module that computes `x + 1` and saves it as a system library.
+2. Builds a minimal TVM runtime pack that can load the module.
+3. Links the TVM module into an SGX enclave along with some code that runs the module.
+4. Compiles and runs an executable that loads the enclave and calls a function
+ which invokes the TVM module.
+
+For more information on building, please refer to the `Makefile`.
+For more information on the TVM module, please refer to `../howto_deploy`.
+For more information on SGX enclaves, please refer to the [SGX Enclave Demo](https://github.com/intel/linux-sgx/tree/master/SampleCode/SampleEnclave/).
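To make the overview concrete: once built, the signed enclave is loaded like any other TVM module. The following sketch condenses the bundled `test_addone.py` (included later in this change) and assumes the Makefile's `lib/` output layout:

```python
import numpy as np
import tvm

# Load the signed enclave as an ordinary TVM module and run it on CPU.
ctx = tvm.context('cpu', 0)
fadd = tvm.module.load('lib/test_addone.signed.so')

n = 10
x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
fadd(x, y)  # computes x + 1 inside the enclave
np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
```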
diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave_config.xml
new file mode 100644
index 000000000000..07be0d7a7ad2
--- /dev/null
+++ b/apps/sgx/enclave_config.xml
@@ -0,0 +1,11 @@
+<EnclaveConfiguration>
+  <ProdID>0</ProdID>
+  <ISVSVN>0</ISVSVN>
+  <StackMaxSize>0x2000</StackMaxSize>
+  <HeapMaxSize>0x2000</HeapMaxSize>
+  <TCSNum>5</TCSNum>
+  <TCSPolicy>1</TCSPolicy>
+  <DisableDebug>0</DisableDebug>
+  <MiscSelect>0</MiscSelect>
+  <MiscMask>0xFFFFFFFF</MiscMask>
+</EnclaveConfiguration>
diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py
new file mode 100644
index 000000000000..f676f46b7ff0
--- /dev/null
+++ b/apps/sgx/prepare_test_libs.py
@@ -0,0 +1,26 @@
+"""Script to prepare test_addone_sys.o"""
+
+from os import path as osp
+
+import tvm
+
+CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
+
+
+def main():
+ out_dir = osp.join(CWD, 'lib')
+
+ n = tvm.var('n')
+ A = tvm.placeholder((n,), name='A')
+ B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
+ s = tvm.create_schedule(B.op)
+ s[B].parallel(s[B].op.axis[0])
+ print(tvm.lower(s, [A, B], simple_mode=True))
+
+ # Compile library in system library mode
+ fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib')
+ fadd_syslib.save(osp.join(out_dir, 'test_addone_sys.o'))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh
new file mode 100755
index 000000000000..9334b260cbf3
--- /dev/null
+++ b/apps/sgx/run_example.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+sgx_sdk=${SGX_SDK:=/opt/sgxsdk}
+make
+echo "========================="
+LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python test_addone.py
diff --git a/apps/sgx/test_addone.py b/apps/sgx/test_addone.py
new file mode 100644
index 000000000000..5ddccfa425cc
--- /dev/null
+++ b/apps/sgx/test_addone.py
@@ -0,0 +1,13 @@
+import tvm
+import numpy as np
+
+ctx = tvm.context('cpu', 0)
+fadd1 = tvm.module.load('lib/test_addone.signed.so')
+
+n = 10
+x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
+y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
+fadd1(x, y)
+
+np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
+print("It works!")
diff --git a/cmake/config.cmake b/cmake/config.cmake
new file mode 100644
index 000000000000..85c5102169a9
--- /dev/null
+++ b/cmake/config.cmake
@@ -0,0 +1,116 @@
+#--------------------------------------------------------------------
+# Template custom cmake configuration for compiling
+#
+# This file is used to override the build options in the build directory.
+# If you want to change the configuration, please use the following
+# steps. Assume you are in the root directory. First copy this
+# file so that any local changes will be ignored by git.
+#
+# $ mkdir build
+# $ cp cmake/config.cmake build
+#
+# Next modify the corresponding entries, and then compile by
+#
+# $ cd build
+# $ cmake ..
+#
+# Then build in parallel with 8 threads
+#
+# $ make -j8
+#--------------------------------------------------------------------
+
+#---------------------------------------------
+# Backend runtimes.
+#---------------------------------------------
+
+# Whether enable CUDA during compilation
+#
+# Possible values:
+# - ON: enable CUDA with cmake's auto search
+# - OFF: disable CUDA
+# - /path/to/cuda: use specific path to cuda toolkit
+set(USE_CUDA OFF)
+
+# Whether enable ROCM runtime
+#
+# Possible values:
+# - ON: enable ROCM with cmake's auto search
+# - OFF: disable ROCM
+# - /path/to/rocm: use specific path to rocm
+set(USE_ROCM OFF)
+
+# Whether enable SDAccel runtime
+set(USE_SDACCEL OFF)
+
+# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
+set(USE_AOCL OFF)
+
+# Whether enable OpenCL runtime
+set(USE_OPENCL OFF)
+
+# Whether enable Metal runtime
+set(USE_METAL OFF)
+
+# Whether enable Vulkan runtime
+#
+# Possible values:
+# - ON: enable Vulkan with cmake's auto search
+# - OFF: disable vulkan
+# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
+set(USE_VULKAN OFF)
+
+# Whether enable OpenGL runtime
+set(USE_OPENGL OFF)
+
+# Whether enable RPC runtime
+set(USE_RPC ON)
+
+# Whether enable tiny embedded graph runtime.
+set(USE_GRAPH_RUNTIME ON)
+
+# Whether enable additional graph debug functions
+set(USE_GRAPH_RUNTIME_DEBUG OFF)
+
+# Whether build with LLVM support
+# Requires LLVM version >= 4.0
+#
+# Possible values:
+# - ON: enable llvm with cmake's find search
+# - OFF: disable llvm
+# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
+set(USE_LLVM OFF)
+
+#---------------------------------------------
+# Contrib libraries
+#---------------------------------------------
+# Whether use BLAS, choices: openblas, mkl, atlas, apple
+set(USE_BLAS none)
+
+# /path/to/mkl: mkl root path when use mkl blas library
+# set(USE_MKL_PATH /opt/intel/mkl) for UNIX
+# set(USE_MKL_PATH ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
+set(USE_MKL_PATH none)
+
+# Whether use contrib.random in runtime
+set(USE_RANDOM OFF)
+
+# Whether use NNPack
+set(USE_NNPACK OFF)
+
+# Whether use CuDNN
+set(USE_CUDNN OFF)
+
+# Whether use cuBLAS
+set(USE_CUBLAS OFF)
+
+# Whether use MIOpen
+set(USE_MIOPEN OFF)
+
+# Whether use MPS
+set(USE_MPS OFF)
+
+# Whether use rocBlas
+set(USE_ROCBLAS OFF)
+
+# Whether use contrib sort
+set(USE_SORT OFF)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
new file mode 100644
index 000000000000..70c8c8eebe28
--- /dev/null
+++ b/cmake/modules/CUDA.cmake
@@ -0,0 +1,40 @@
+# CUDA Module
+find_cuda(${USE_CUDA})
+
+if(CUDA_FOUND)
+ # always set the includedir when cuda is available
+ # avoid global retrigger of cmake
+ include_directories(${CUDA_INCLUDE_DIRS})
+endif(CUDA_FOUND)
+
+if(USE_CUDA)
+ if(NOT CUDA_FOUND)
+ message(FATAL_ERROR "Cannot find CUDA, USE_CUDA=" ${USE_CUDA})
+ endif()
+ message(STATUS "Build with CUDA support")
+ file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_cuda_on.cc)
+
+ list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
+
+ if(USE_CUDNN)
+ message(STATUS "Build with cuDNN support")
+ file(GLOB CONTRIB_CUDNN_SRCS src/contrib/cudnn/*.cc)
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CUDNN_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIBRARY})
+ endif(USE_CUDNN)
+
+ if(USE_CUBLAS)
+ message(STATUS "Build with cuBLAS support")
+ file(GLOB CONTRIB_CUBLAS_SRCS src/contrib/cublas/*.cc)
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CUBLAS_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUBLAS_LIBRARY})
+ endif(USE_CUBLAS)
+
+else(USE_CUDA)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_cuda_off.cc)
+endif(USE_CUDA)
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
new file mode 100644
index 000000000000..3e896a601056
--- /dev/null
+++ b/cmake/modules/LLVM.cmake
@@ -0,0 +1,21 @@
+# LLVM rules
+add_definitions(-DDMLC_USE_FOPEN64=0)
+
+if(NOT USE_LLVM STREQUAL "OFF")
+ find_llvm(${USE_LLVM})
+ include_directories(${LLVM_INCLUDE_DIRS})
+ add_definitions(${LLVM_DEFINITIONS})
+ message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
+ message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+ # Set flags that are only needed for LLVM target
+ add_definitions(-DTVM_LLVM_VERSION=${TVM_LLVM_VERSION})
+ file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
+ list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
+ list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
+ if(NOT MSVC)
+ set_source_files_properties(${COMPILER_LLVM_SRCS}
+ PROPERTIES COMPILE_DEFINITIONS "DMLC_ENABLE_RTTI=0")
+ set_source_files_properties(${COMPILER_LLVM_SRCS}
+ PROPERTIES COMPILE_FLAGS "-fno-rtti")
+ endif()
+endif()
diff --git a/cmake/modules/Metal.cmake b/cmake/modules/Metal.cmake
new file mode 100644
index 000000000000..27aa5a226f2b
--- /dev/null
+++ b/cmake/modules/Metal.cmake
@@ -0,0 +1,17 @@
+if(USE_METAL)
+ message(STATUS "Build with Metal support")
+ find_library(METAL_LIB Metal)
+ find_library(FOUNDATION_LIB Foundation)
+ file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${METAL_LIB} ${FOUNDATION_LIB})
+ list(APPEND RUNTIME_SRCS ${RUNTIME_METAL_SRCS})
+
+ if(USE_MPS)
+ file(GLOB MPS_CONTRIB_SRC src/contrib/mps/*.mm)
+ list(APPEND RUNTIME_SRCS ${MPS_CONTRIB_SRC})
+ find_library(MPS_CONTRIB_LIB MetalPerformanceShaders)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${MPS_CONTRIB_LIB})
+ endif()
+else(USE_METAL)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_metal_off.cc)
+endif(USE_METAL)
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
new file mode 100644
index 000000000000..b30df1864522
--- /dev/null
+++ b/cmake/modules/OpenCL.cmake
@@ -0,0 +1,42 @@
+# OPENCL Module
+find_package(OpenCL QUIET)
+
+if(OpenCL_FOUND)
+ # always set the includedir when OpenCL is available
+ # avoid global retrigger of cmake
+ include_directories(${OpenCL_INCLUDE_DIRS})
+endif(OpenCL_FOUND)
+
+if(USE_SDACCEL)
+ message(STATUS "Build with SDAccel support")
+ file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
+ if(NOT USE_OPENCL)
+ message(STATUS "Enable OpenCL support required for SDAccel")
+ set(USE_OPENCL ON)
+ endif()
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
+endif(USE_SDACCEL)
+
+if(USE_AOCL)
+ message(STATUS "Build with Intel FPGA SDK for OpenCL support")
+ file(GLOB RUNTIME_AOCL_SRCS src/runtime/opencl/aocl/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_AOCL_SRCS})
+ if(NOT USE_OPENCL)
+ message(STATUS "Enable OpenCL support required for Intel FPGA SDK for OpenCL")
+ set(USE_OPENCL ON)
+ endif()
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_aocl_off.cc)
+endif(USE_AOCL)
+
+if(USE_OPENCL)
+ find_package(OpenCL REQUIRED)
+ message(STATUS "Build with OpenCL support")
+ file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
+ list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_opencl_off.cc)
+endif(USE_OPENCL)
diff --git a/cmake/modules/OpenGL.cmake b/cmake/modules/OpenGL.cmake
new file mode 100644
index 000000000000..2b62c9f302d4
--- /dev/null
+++ b/cmake/modules/OpenGL.cmake
@@ -0,0 +1,18 @@
+find_package(OpenGL QUIET)
+
+if(OpenGL_FOUND)
+ # always set the includedir when OpenGL is available
+ # avoid global retrigger of cmake
+ include_directories(${OPENGL_INCLUDE_DIRS})
+endif(OpenGL_FOUND)
+
+if(USE_OPENGL)
+ find_package(OpenGL REQUIRED)
+ find_package(glfw3 QUIET REQUIRED)
+ message(STATUS "Build with OpenGL support")
+ file(GLOB RUNTIME_OPENGL_SRCS src/runtime/opengl/*.cc)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenGL_LIBRARIES} glfw)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_OPENGL_SRCS})
+else(USE_OPENGL)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_opengl_off.cc)
+endif(USE_OPENGL)
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
new file mode 100644
index 000000000000..0e45fdac66d9
--- /dev/null
+++ b/cmake/modules/ROCM.cmake
@@ -0,0 +1,36 @@
+# ROCM Module
+find_rocm(${USE_ROCM})
+
+if(ROCM_FOUND)
+ # always set the includedir
+ # avoid global retrigger of cmake
+ include_directories(${ROCM_INCLUDE_DIRS})
+ add_definitions(-D__HIP_PLATFORM_HCC__=1)
+endif(ROCM_FOUND)
+
+
+if(USE_ROCM)
+ if(NOT ROCM_FOUND)
+ message(FATAL_ERROR "Cannot find ROCM, USE_ROCM=" ${USE_ROCM})
+ endif()
+ message(STATUS "Build with ROCM support")
+ file(GLOB RUNTIME_ROCM_SRCS src/runtime/rocm/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_ROCM_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_HIPHCC_LIBRARY})
+
+ if(USE_MIOPEN)
+ message(STATUS "Build with MIOpen support")
+ file(GLOB MIOPEN_CONTRIB_SRCS src/contrib/miopen/*.cc)
+ list(APPEND RUNTIME_SRCS ${MIOPEN_CONTRIB_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_MIOPEN_LIBRARY})
+ endif(USE_MIOPEN)
+
+ if(USE_ROCBLAS)
+ message(STATUS "Build with RocBLAS support")
+ file(GLOB ROCBLAS_CONTRIB_SRCS src/contrib/rocblas/*.cc)
+ list(APPEND RUNTIME_SRCS ${ROCBLAS_CONTRIB_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_ROCBLAS_LIBRARY})
+ endif(USE_ROCBLAS)
+else(USE_ROCM)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_rocm_off.cc)
+endif(USE_ROCM)
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
new file mode 100644
index 000000000000..43fb700203c7
--- /dev/null
+++ b/cmake/modules/VTA.cmake
@@ -0,0 +1,51 @@
+# CMake Build rules for VTA
+find_program(PYTHON NAMES python python3 python3.6)
+
+if(MSVC)
+ message(STATUS "VTA build is skipped in Windows..")
+elseif(PYTHON)
+ set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py)
+
+ if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ message(STATUS "Use VTA config " ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py
+ --use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ endif()
+
+ execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE __vta_target)
+ string(STRIP ${__vta_target} VTA_TARGET)
+
+ message(STATUS "Build VTA runtime with target: " ${VTA_TARGET})
+
+ execute_process(COMMAND ${VTA_CONFIG} --defs OUTPUT_VARIABLE __vta_defs)
+
+ string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}")
+
+ file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc)
+ file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc)
+ list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs})
+
+ add_library(vta SHARED ${VTA_RUNTIME_SRCS})
+
+ target_include_directories(vta PUBLIC vta/include)
+
+ foreach(__def ${VTA_DEFINITIONS})
+ string(SUBSTRING ${__def} 3 -1 __strip_def)
+ target_compile_definitions(vta PUBLIC ${__strip_def})
+ endforeach()
+
+ if(APPLE)
+ set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+ endif(APPLE)
+
+ # PYNQ rules
+ if(${VTA_TARGET} STREQUAL "pynq")
+ find_library(__sds_lib NAMES sds_lib PATHS /usr/lib)
+ find_library(__dma_lib NAMES dma PATHS
+ "/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/"
+ "/opt/python3.6/lib/python3.6/site-packages/pynq/lib/")
+ target_link_libraries(vta ${__sds_lib} ${__dma_lib})
+ endif()
+else()
+ message(STATUS "Cannot found python in env, VTA build is skipped..")
+endif()
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
new file mode 100644
index 000000000000..4093f88f2e10
--- /dev/null
+++ b/cmake/modules/Vulkan.cmake
@@ -0,0 +1,22 @@
+# Be compatible with older versions of CMake
+find_vulkan(${USE_VULKAN})
+
+if(Vulkan_FOUND)
+ # always set the includedir
+ # avoid global retrigger of cmake
+ include_directories(${Vulkan_INCLUDE_DIRS})
+endif(Vulkan_FOUND)
+
+if(USE_VULKAN)
+ if(NOT Vulkan_FOUND)
+ message(FATAL_ERROR "Cannot find Vulkan, USE_VULKAN=" ${USE_VULKAN})
+ endif()
+ message(STATUS "Build with VULKAN support")
+ file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/*.cc)
+ file(GLOB COMPILER_VULKAN_SRCS src/codegen/spirv/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_VULKAN_SRCS})
+ list(APPEND COMPILER_SRCS ${COMPILER_VULKAN_SRCS})
+
+ list(APPEND TVM_LINKER_LIBS ${Vulkan_SPIRV_TOOLS_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${Vulkan_LIBRARY})
+endif(USE_VULKAN)
diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake
new file mode 100644
index 000000000000..45269a20715d
--- /dev/null
+++ b/cmake/modules/contrib/BLAS.cmake
@@ -0,0 +1,34 @@
+# Plugin rules for cblas
+file(GLOB CBLAS_CONTRIB_SRC src/contrib/cblas/*.cc)
+
+if(USE_BLAS STREQUAL "openblas")
+ find_library(BLAS_LIBRARY openblas)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "mkl")
+ if(NOT IS_DIRECTORY ${USE_MKL_PATH})
+ set(USE_MKL_PATH /opt/intel/mkl)
+ endif()
+ find_library(BLAS_LIBRARY mkl_rt ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64)
+ include_directories(${USE_MKL_PATH}/include)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ add_definitions(-DUSE_MKL_BLAS=1)
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas")
+ find_library(BLAS_LIBRARY cblas)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "apple")
+ find_library(BLAS_LIBRARY Accelerate)
+ include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "none")
+ # pass
+else()
+ message(FATAL_ERROR "Invalid option: USE_BLAS=" ${USE_BLAS})
+endif()
diff --git a/cmake/modules/contrib/NNPack.cmake b/cmake/modules/contrib/NNPack.cmake
new file mode 100644
index 000000000000..82de88a21e63
--- /dev/null
+++ b/cmake/modules/contrib/NNPack.cmake
@@ -0,0 +1,14 @@
+if(USE_NNPACK)
+ if(NNPACK_PATH STREQUAL "")
+ set(NNPACK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/NNPack)
+ endif()
+ set(PTHREAD_POOL_PATH ${NNPACK_PATH}/deps/pthreadpool)
+ file(GLOB NNPACK_CONTRIB_SRC src/contrib/nnpack/*.cc)
+ list(APPEND RUNTIME_SRCS ${NNPACK_CONTRIB_SRC})
+ include_directories(${NNPACK_PATH}/include)
+ include_directories(${PTHREAD_POOL_PATH}/include)
+ find_library(NNPACK_CONTRIB_LIB nnpack ${NNPACK_PATH}/lib)
+ find_library(NNPACK_PTHREAD_CONTRIB_LIB pthreadpool ${NNPACK_PATH}/lib)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CONTRIB_LIB})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_PTHREAD_CONTRIB_LIB})
+endif(USE_NNPACK)
diff --git a/cmake/modules/contrib/Random.cmake b/cmake/modules/contrib/Random.cmake
new file mode 100644
index 000000000000..a6980be8bb5b
--- /dev/null
+++ b/cmake/modules/contrib/Random.cmake
@@ -0,0 +1,5 @@
+if(USE_RANDOM)
+ message(STATUS "Build with contrib.random")
+ file(GLOB RANDOM_CONTRIB_SRC src/contrib/random/random.cc)
+ list(APPEND RUNTIME_SRCS ${RANDOM_CONTRIB_SRC})
+endif(USE_RANDOM)
diff --git a/cmake/modules/contrib/Sort.cmake b/cmake/modules/contrib/Sort.cmake
new file mode 100644
index 000000000000..9ef637ecd99f
--- /dev/null
+++ b/cmake/modules/contrib/Sort.cmake
@@ -0,0 +1,5 @@
+if(USE_SORT)
+ message(STATUS "Build with contrib.sort")
+ file(GLOB SORT_CONTRIB_SRC src/contrib/sort/*.cc)
+ list(APPEND RUNTIME_SRCS ${SORT_CONTRIB_SRC})
+endif(USE_SORT)
diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
new file mode 100644
index 000000000000..3ce0cc40a5e5
--- /dev/null
+++ b/cmake/util/FindCUDA.cmake
@@ -0,0 +1,74 @@
+#######################################################
+# Enhanced version of find CUDA.
+#
+# Usage:
+# find_cuda(${USE_CUDA})
+#
+# - When USE_CUDA=ON, use auto search
+# - When USE_CUDA=/path/to/cuda-path, use the cuda path
+#
+# Provide variables:
+#
+# - CUDA_FOUND
+# - CUDA_INCLUDE_DIRS
+# - CUDA_TOOLKIT_ROOT_DIR
+# - CUDA_CUDA_LIBRARY
+# - CUDA_CUDART_LIBRARY
+# - CUDA_NVRTC_LIBRARY
+# - CUDA_CUDNN_LIBRARY
+# - CUDA_CUBLAS_LIBRARY
+#
+macro(find_cuda use_cuda)
+ set(__use_cuda ${use_cuda})
+ if(__use_cuda STREQUAL "ON")
+ find_package(CUDA QUIET)
+ elseif(IS_DIRECTORY ${__use_cuda})
+ set(CUDA_TOOLKIT_ROOT_DIR ${__use_cuda})
+ message(STATUS "Custom CUDA_PATH=" ${CUDA_TOOLKIT_ROOT_DIR})
+ set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_ROOT_DIR}/include)
+ set(CUDA_FOUND TRUE)
+ if(MSVC)
+ find_library(CUDA_CUDART_LIBRARY cudart
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ else(MSVC)
+ find_library(CUDA_CUDART_LIBRARY cudart
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ endif(MSVC)
+ endif()
+
+ # additional libraries
+ if(CUDA_FOUND)
+ if(MSVC)
+ find_library(CUDA_CUDA_LIBRARY cuda
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_NVRTC_LIBRARY nvrtc
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_CUDNN_LIBRARY cudnn
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_CUBLAS_LIBRARY cublas
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ else(MSVC)
+ find_library(_CUDA_CUDA_LIBRARY cuda
+ PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+ if(_CUDA_CUDA_LIBRARY)
+ set(CUDA_CUDA_LIBRARY ${_CUDA_CUDA_LIBRARY})
+ endif()
+ find_library(CUDA_NVRTC_LIBRARY nvrtc
+ PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+ find_library(CUDA_CUDNN_LIBRARY cudnn
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ find_library(CUDA_CUBLAS_LIBRARY cublas
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ endif(MSVC)
+ endif(CUDA_FOUND)
+endmacro(find_cuda)
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
new file mode 100644
index 000000000000..4bb58d462d12
--- /dev/null
+++ b/cmake/util/FindLLVM.cmake
@@ -0,0 +1,59 @@
+#######################################################
+# Enhanced version of find llvm.
+#
+# Usage:
+# find_llvm(${USE_LLVM})
+#
+# - When USE_LLVM=ON, use auto search
+# - When USE_LLVM=/path/to/llvm-config, use corresponding config
+#
+# Provide variables:
+# - LLVM_INCLUDE_DIRS
+# - LLVM_LIBS
+# - LLVM_DEFINITIONS
+# - TVM_LLVM_VERSION
+#
+macro(find_llvm use_llvm)
+ set(LLVM_CONFIG ${use_llvm})
+ if(LLVM_CONFIG STREQUAL "ON")
+ find_package(LLVM REQUIRED CONFIG)
+ llvm_map_components_to_libnames(LLVM_LIBS all)
+ list (FIND LLVM_LIBS "LLVM" _llvm_dynlib_index)
+ if (${_llvm_dynlib_index} GREATER -1)
+ set(LLVM_LIBS LLVM)
+ message(STATUS "Link with dynamic LLVM library")
+ else()
+ list(REMOVE_ITEM LLVM_LIBS LTO)
+ message(STATUS "Link with static LLVM libraries")
+ endif()
+ set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
+ elseif(NOT LLVM_CONFIG STREQUAL "OFF")
+ # use llvm config
+ message(STATUS "Use llvm-config=" ${LLVM_CONFIG})
+ execute_process(COMMAND ${LLVM_CONFIG} --libfiles
+ OUTPUT_VARIABLE __llvm_libfiles)
+ execute_process(COMMAND ${LLVM_CONFIG} --system-libs
+ OUTPUT_VARIABLE __llvm_system_libs)
+ execute_process(COMMAND ${LLVM_CONFIG} --cxxflags
+ OUTPUT_VARIABLE __llvm_cxxflags)
+ execute_process(COMMAND ${LLVM_CONFIG} --version
+ COMMAND cut -b 1,3
+ OUTPUT_VARIABLE TVM_LLVM_VERSION)
+ # definitions
+ string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" LLVM_DEFINITIONS ${__llvm_cxxflags})
+ # include dir
+ string(REGEX MATCHALL "(^| )-I[^ ]*" __llvm_include_flags ${__llvm_cxxflags})
+ set(LLVM_INCLUDE_DIRS "")
+ foreach(__flag IN ITEMS ${__llvm_include_flags})
+ string(REGEX REPLACE "(^| )-I" "" __dir "${__flag}")
+ list(APPEND LLVM_INCLUDE_DIRS "${__dir}")
+ endforeach()
+ message(STATUS ${LLVM_INCLUDE_DIRS})
+ # libfiles
+ string(STRIP ${__llvm_libfiles} __llvm_libfiles)
+ string(STRIP ${__llvm_system_libs} __llvm_system_libs)
+ set(LLVM_LIBS "${__llvm_libfiles} ${__llvm_system_libs}")
+ separate_arguments(LLVM_LIBS)
+ string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION)
+ endif()
+endmacro(find_llvm)
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
new file mode 100644
index 000000000000..235969813382
--- /dev/null
+++ b/cmake/util/FindROCM.cmake
@@ -0,0 +1,41 @@
+#######################################################
+# Enhanced version of find rocm.
+#
+# Usage:
+# find_rocm(${USE_ROCM})
+#
+# - When USE_ROCM=ON, use auto search
+# - When USE_ROCM=/path/to/rocm-path, use the rocm path
+#
+# Provide variables:
+#
+# - ROCM_FOUND
+# - ROCM_INCLUDE_DIRS
+# - ROCM_HIPHCC_LIBRARY
+# - ROCM_MIOPEN_LIBRARY
+# - ROCM_ROCBLAS_LIBRARY
+#
+
+macro(find_rocm use_rocm)
+ set(__use_rocm ${use_rocm})
+ if(IS_DIRECTORY ${__use_rocm})
+ set(__rocm_sdk ${__use_rocm})
+ message(STATUS "Custom ROCM SDK PATH=" ${__use_rocm})
+ elseif(IS_DIRECTORY $ENV{ROCM_PATH})
+ set(__rocm_sdk $ENV{ROCM_PATH})
+ elseif(IS_DIRECTORY /opt/rocm)
+ set(__rocm_sdk /opt/rocm)
+ else()
+ set(__rocm_sdk "")
+ endif()
+
+ if(__rocm_sdk)
+ set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
+ find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
+ find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
+ find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
+ if(ROCM_HIPHCC_LIBRARY)
+ set(ROCM_FOUND TRUE)
+ endif()
+ endif(__rocm_sdk)
+endmacro(find_rocm)
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
new file mode 100644
index 000000000000..0b85e8f47d79
--- /dev/null
+++ b/cmake/util/FindVulkan.cmake
@@ -0,0 +1,55 @@
+#######################################################
+# Enhanced version of find Vulkan.
+#
+# Usage:
+# find_vulkan(${USE_VULKAN})
+#
+# - When USE_VULKAN=ON, use auto search
+# - When USE_VULKAN=/path/to/vulkan-sdk-path, use the sdk
+#
+# Provide variables:
+#
+# - Vulkan_FOUND
+# - Vulkan_INCLUDE_DIRS
+# - Vulkan_LIBRARY
+# - Vulkan_SPIRV_TOOLS_LIBRARY
+#
+
+macro(find_vulkan use_vulkan)
+ set(__use_vulkan ${use_vulkan})
+ if(IS_DIRECTORY ${__use_vulkan})
+ set(__vulkan_sdk ${__use_vulkan})
+ message(STATUS "Custom Vulkan SDK PATH=" ${__use_vulkan})
+ elseif(IS_DIRECTORY $ENV{VULKAN_SDK})
+ set(__vulkan_sdk $ENV{VULKAN_SDK})
+ else()
+ set(__vulkan_sdk "")
+ endif()
+
+ if(__vulkan_sdk)
+ set(Vulkan_INCLUDE_DIRS ${__vulkan_sdk}/include)
+ find_library(Vulkan_LIBRARY NAMES vulkan vulkan-1 PATHS ${__vulkan_sdk}/lib)
+ if(Vulkan_LIBRARY)
+ set(Vulkan_FOUND TRUE)
+ endif()
+ endif(__vulkan_sdk)
+
+ # resort to find_package(Vulkan) if the option is ON
+ if(NOT Vulkan_FOUND)
+ if(__use_vulkan STREQUAL "ON")
+ find_package(Vulkan QUIET)
+ endif()
+ endif()
+ # additional libraries
+
+ if(Vulkan_FOUND)
+ get_filename_component(VULKAN_LIBRARY_PATH ${Vulkan_LIBRARY} DIRECTORY)
+ find_library(Vulkan_SPIRV_TOOLS_LIBRARY SPIRV-Tools
+ ${VULKAN_LIBRARY_PATH}/spirv-tools)
+
+ find_path(_libspirv libspirv.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv-tools)
+ find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+ find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+ list(APPEND Vulkan_INCLUDE_DIRS ${_libspirv} ${_spirv} ${_glsl_std})
+ endif(Vulkan_FOUND)
+endmacro(find_vulkan)
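+
+# A minimal usage sketch (the caller below is an assumption, not part of
+# this file):
+#
+#   find_vulkan(${USE_VULKAN})
+#   if(Vulkan_FOUND)
+#     include_directories(${Vulkan_INCLUDE_DIRS})
+#     # link against ${Vulkan_LIBRARY} and ${Vulkan_SPIRV_TOOLS_LIBRARY}
+#   endif()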
diff --git a/cmake/Util.cmake b/cmake/util/Util.cmake
similarity index 72%
rename from cmake/Util.cmake
rename to cmake/util/Util.cmake
index 0cc3acea5e55..fb3118bde9e0 100644
--- a/cmake/Util.cmake
+++ b/cmake/util/Util.cmake
@@ -1,8 +1,14 @@
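+# Internal helper of tvm_option: create a cached STRING entry only when the
+# variable is not already defined, so values set earlier (e.g. in
+# config.cmake) take precedence.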
+macro(__tvm_option variable description value)
+ if(NOT DEFINED ${variable})
+ set(${variable} ${value} CACHE STRING ${description})
+ endif()
+endmacro()
+
#######################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
# tvm_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
-function(tvm_option variable description value)
+macro(tvm_option variable description value)
set(__value ${value})
set(__condition "")
set(__varname "__value")
@@ -21,23 +27,23 @@ function(tvm_option variable description value)
if(${__condition})
if("${__value}" MATCHES ";")
if(${__value})
- option(${variable} "${description}" ON)
+ __tvm_option(${variable} "${description}" ON)
else()
- option(${variable} "${description}" OFF)
+ __tvm_option(${variable} "${description}" OFF)
endif()
elseif(DEFINED ${__value})
if(${__value})
- option(${variable} "${description}" ON)
+ __tvm_option(${variable} "${description}" ON)
else()
- option(${variable} "${description}" OFF)
+ __tvm_option(${variable} "${description}" OFF)
endif()
else()
- option(${variable} "${description}" ${__value})
+ __tvm_option(${variable} "${description}" "${__value}")
endif()
else()
unset(${variable} CACHE)
endif()
-endfunction()
+endmacro()
function(assign_source_group group)
foreach(_source IN ITEMS ${ARGN})
@@ -50,4 +56,4 @@ function(assign_source_group group)
string(REPLACE "/" "\\" _source_path_msvc "${_source_path}")
source_group("${group}\\${_source_path_msvc}" FILES "${_source}")
endforeach()
-endfunction(assign_source_group)
\ No newline at end of file
+endfunction(assign_source_group)
diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml
new file mode 100644
index 000000000000..7f18f5eea432
--- /dev/null
+++ b/conda/conda_build_config.yaml
@@ -0,0 +1,4 @@
+python:
+ - 3.5
+ - 3.6
+ - 3.7
\ No newline at end of file
diff --git a/conda/nnvm/build.sh b/conda/nnvm/build.sh
new file mode 100644
index 000000000000..9f7889e610e7
--- /dev/null
+++ b/conda/nnvm/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd nnvm/python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml
new file mode 100644
index 000000000000..a8b47d0de118
--- /dev/null
+++ b/conda/nnvm/meta.yaml
@@ -0,0 +1,39 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: nnvm
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+ skip: True # [win]
+
+requirements:
+ build:
+ - {{ compiler('cxx') }}
+ host:
+ - python {{ python }}
+ - cython
+ - numpy
+ - setuptools
+ - decorator
+ - tvm-libs =={{ version }}
+ run:
+ - tvm =={{ version }}
+ - topi =={{ version }}
+ - tvm-libs =={{ version }}
+ - python
+ - {{ pin_compatible('numpy') }}
+ - decorator
+
+test:
+ imports:
+ - nnvm
+
+about:
+ home: https://github.com/dmlc/nnvm
+ license: Apache2
+ summary: Bring deep learning to bare metal
diff --git a/conda/topi/build.sh b/conda/topi/build.sh
new file mode 100644
index 000000000000..a1f5e491c8eb
--- /dev/null
+++ b/conda/topi/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd topi/python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml
new file mode 100644
index 000000000000..af2fb4fd4228
--- /dev/null
+++ b/conda/topi/meta.yaml
@@ -0,0 +1,34 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: topi
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+
+requirements:
+ host:
+ - python {{ python }}
+ - numpy
+ - setuptools
+ - decorator
+ - tvm-libs =={{ version }}
+ run:
+ - python
+ - {{ pin_compatible('numpy') }}
+ - decorator
+ - tvm-libs =={{ version }}
+ - tvm =={{ version }}
+
+test:
+ imports:
+ - topi
+
+about:
+ home: https://github.com/dmlc/tvm
+ license: Apache2
+ summary: "TOPI: TVM Operator Inventory"
diff --git a/conda/tvm-libs/build.sh b/conda/tvm-libs/build.sh
new file mode 100644
index 000000000000..d427d922a21e
--- /dev/null
+++ b/conda/tvm-libs/build.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+if [ -z "$PREFIX" ]; then
+ PREFIX="$CONDA_PREFIX"
+fi
+
+rm -rf build || true
+mkdir -p build
+cd build
+cmake -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" ..
+make -j2 VERBOSE=1
+make install
+cd ..
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
new file mode 100644
index 000000000000..dbdfd4a7701f
--- /dev/null
+++ b/conda/tvm-libs/meta.yaml
@@ -0,0 +1,30 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: tvm-libs
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+
+requirements:
+ build:
+ - {{ compiler('cxx') }} # [linux]
+ - llvmdev ==6.0.0 # [osx]
+ host:
+ # The OS X build will require some manual setup or it will break
+ # See https://conda.io/docs/user-guide/tasks/build-packages/compiler-tools.html#macos-sdk
+ # It is also awkward because of llvm brokenness when mixed with the
+ # conda OS X compiler
+ - {{ compiler('cxx') }} # [osx]
+ - cmake
+ - llvmdev ==6.0.0 # [linux]
+ - zlib # [linux]
+
+about:
+ home: https://github.com/dmlc/tvm
+ license: Apache2
+ summary: a low level domain specific language for compiling tensor computation pipelines
\ No newline at end of file
diff --git a/conda/tvm/build.sh b/conda/tvm/build.sh
new file mode 100644
index 000000000000..9c958a32e629
--- /dev/null
+++ b/conda/tvm/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/python/conda/meta.yaml b/conda/tvm/meta.yaml
similarity index 55%
rename from python/conda/meta.yaml
rename to conda/tvm/meta.yaml
index 9ebb5afac543..478e095322eb 100644
--- a/python/conda/meta.yaml
+++ b/conda/tvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.1.dev" %}
+{% set version = "0.4.dev" %}
package:
name: tvm
@@ -8,23 +8,27 @@ source:
path: ../..
build:
- number: 1
- skip: True # [win]
- script_env:
- - CONDA_CUDA_HOME
+ number: 0
requirements:
build:
- - llvmdev ==4.0.0
- - python >=3
+ - {{ compiler('cxx') }}
+ host:
+ - python {{ python }}
+ - cython
- numpy
- setuptools
- - nose
- decorator
+ - tvm-libs =={{ version }}
run:
- - python >=3
- - numpy
+ - python
+ - {{ pin_compatible('numpy') }}
- decorator
+ - tvm-libs =={{ version }}
+
+test:
+ imports:
+ - tvm
about:
home: https://github.com/dmlc/tvm
diff --git a/dlpack b/dlpack
index 9422e98f3f4d..10892ac964f1 160000
--- a/dlpack
+++ b/dlpack
@@ -1 +1 @@
-Subproject commit 9422e98f3f4dafc6bc3473cf8484543ad376aab6
+Subproject commit 10892ac964f1af7c81aae145cd3fab78bbccd297
diff --git a/dmlc-core b/dmlc-core
index 04f91953ace7..e864aa6757cd 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 04f91953ace74aced3bb317990515304c5425849
+Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3
diff --git a/tests/ci_build/Dockerfile.cpu b/docker/Dockerfile.ci_cpu
similarity index 62%
rename from tests/ci_build/Dockerfile.cpu
rename to docker/Dockerfile.ci_cpu
index b113fc548fcb..0f0fc6f04d4c 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -1,4 +1,4 @@
-# For CPU
+# CI docker CPU env
FROM ubuntu:16.04
RUN apt-get update --fix-missing
@@ -9,11 +9,12 @@ RUN bash /install/ubuntu_install_core.sh
COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
RUN bash /install/ubuntu_install_python.sh
-COPY install/ubuntu_install_iverilog.sh /install/ubuntu_install_iverilog.sh
-RUN bash /install/ubuntu_install_iverilog.sh
-
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
+COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
+RUN bash /install/ubuntu_install_llvm.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
diff --git a/tests/ci_build/Dockerfile.emscripten b/docker/Dockerfile.ci_emscripten
similarity index 72%
rename from tests/ci_build/Dockerfile.emscripten
rename to docker/Dockerfile.ci_emscripten
index 59bf02ea7d2c..b4d5a63c52ef 100644
--- a/tests/ci_build/Dockerfile.emscripten
+++ b/docker/Dockerfile.ci_emscripten
@@ -15,4 +15,8 @@ RUN bash /install/ubuntu_install_emscripten.sh
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
-RUN cp /root/.emscripten /emsdk-portable/
\ No newline at end of file
+RUN chmod a+rwx -R /emsdk-portable
+RUN cp -r /emsdk-portable /emsdk-portable-backup
+RUN mv /emsdk-portable /emsdk-portable-x
+RUN mv /emsdk-portable-backup /emsdk-portable
+RUN cp /root/.emscripten /emsdk-portable/
diff --git a/tests/ci_build/Dockerfile.gpu b/docker/Dockerfile.ci_gpu
similarity index 56%
rename from tests/ci_build/Dockerfile.gpu
rename to docker/Dockerfile.ci_gpu
index 9dff84e84635..c177ef9d420a 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -1,3 +1,4 @@
+# CI docker GPU env
FROM nvidia/cuda:8.0-cudnn7-devel
# Base scripts
@@ -15,9 +16,6 @@ RUN bash /install/ubuntu_install_llvm.sh
COPY install/ubuntu_install_opencl.sh /install/ubuntu_install_opencl.sh
RUN bash /install/ubuntu_install_opencl.sh
-COPY install/ubuntu_install_iverilog.sh /install/ubuntu_install_iverilog.sh
-RUN bash /install/ubuntu_install_iverilog.sh
-
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
@@ -26,7 +24,10 @@ RUN bash /install/ubuntu_install_sphinx.sh
# Fix recommonmark to latest version
RUN git clone https://github.com/rtfd/recommonmark
-RUN cd recommonmark; python setup.py install
+RUN cd recommonmark; python3 setup.py install
+
+# Enable doxygen for c++ doc build
+RUN apt-get update && apt-get install -y doxygen graphviz libprotobuf-dev protobuf-compiler
COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
RUN bash /install/ubuntu_install_java.sh
@@ -37,16 +38,45 @@ RUN bash /install/ubuntu_install_nodejs.sh
COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
RUN bash /install/ubuntu_install_rocm.sh
-# Enable doxygen for c++ doc build
-RUN apt-get install -y doxygen graphviz
+COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
+RUN bash /install/ubuntu_install_opengl.sh
+
+# DL Frameworks
+COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
+RUN bash /install/ubuntu_install_mxnet.sh
+
+COPY install/ubuntu_install_coreml.sh /install/ubuntu_install_coreml.sh
+RUN bash /install/ubuntu_install_coreml.sh
+
+COPY install/ubuntu_install_keras.sh /install/ubuntu_install_keras.sh
+RUN bash /install/ubuntu_install_keras.sh
+
+COPY install/ubuntu_install_darknet.sh /install/ubuntu_install_darknet.sh
+RUN bash /install/ubuntu_install_darknet.sh
+
+COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
+RUN bash /install/ubuntu_install_onnx.sh
+
+RUN pip3 install Pillow
+
+COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
+RUN bash /install/ubuntu_install_vulkan.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
# Environment variables
-ENV PATH=/node_modules/.bin:${PATH}
ENV PATH=/usr/local/nvidia/bin:${PATH}
-ENV PATH=/usr/clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-14.04/bin:${PATH}
ENV PATH=/usr/local/cuda/bin:${PATH}
ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
+
ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}
+ENV PATH=/node_modules/.bin:${PATH}
+ENV VULKAN_SDK=/usr/local/VulkanSDK/1.0.65.0/x86_64
+ENV PATH=${PATH}:${VULKAN_SDK}/bin
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${VULKAN_SDK}/lib
+ENV VK_LAYER_PATH=${VULKAN_SDK}/etc/explicit_layer.d
diff --git a/tests/ci_build/Dockerfile.i386 b/docker/Dockerfile.ci_i386
similarity index 78%
rename from tests/ci_build/Dockerfile.i386
rename to docker/Dockerfile.ci_i386
index e4577c37edd1..6a8394e85763 100644
--- a/tests/ci_build/Dockerfile.i386
+++ b/docker/Dockerfile.ci_i386
@@ -1,3 +1,5 @@
+# CI docker i386 env
+
FROM ioft/i386-ubuntu:16.04
RUN apt-get update --fix-missing
@@ -13,3 +15,7 @@ RUN bash /install/ubuntu_install_python.sh
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint
new file mode 100644
index 000000000000..132e8ebb7df9
--- /dev/null
+++ b/docker/Dockerfile.ci_lint
@@ -0,0 +1,9 @@
+# For lint test
+# CI docker lint env
+FROM ubuntu:16.04
+
+RUN apt-get update && apt-get install -y sudo wget
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+RUN apt-get install -y doxygen graphviz
+RUN pip3 install cpplint pylint mypy
diff --git a/docker/Dockerfile.demo_cpu b/docker/Dockerfile.demo_cpu
new file mode 100644
index 000000000000..0778b0a28784
--- /dev/null
+++ b/docker/Dockerfile.demo_cpu
@@ -0,0 +1,31 @@
+# Minimum docker image for demo purposes
+# prebuilt-image: tvmai/demo-cpu
+FROM ubuntu:16.04
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+# Python: basic dependencies
+RUN apt-get update && apt-get install -y python3-dev python3-pip
+RUN pip3 install numpy nose-timer cython decorator scipy
+
+# LLVM
+RUN echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main \
+ >> /etc/apt/sources.list.d/llvm.list && \
+ wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && \
+ apt-get update && apt-get install -y --force-yes llvm-6.0
+
+# Jupyter notebook.
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+# Deep learning frameworks
+RUN pip3 install mxnet tensorflow keras
+
+# Build TVM
+COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh
+RUN bash /install/install_tvm_cpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
diff --git a/docker/Dockerfile.demo_gpu b/docker/Dockerfile.demo_gpu
new file mode 100644
index 000000000000..6f249986e22c
--- /dev/null
+++ b/docker/Dockerfile.demo_gpu
@@ -0,0 +1,34 @@
+# Minimum docker image for demo purposes
+# prebuilt-image: tvmai/demo-gpu
+FROM nvidia/cuda:8.0-cudnn7-devel
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+# Python: basic dependencies
+RUN apt-get update && apt-get install -y python3-dev python3-pip
+RUN pip3 install numpy nose-timer cython decorator scipy
+
+# LLVM
+RUN echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main \
+ >> /etc/apt/sources.list.d/llvm.list && \
+ wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && \
+ apt-get update && apt-get install -y --force-yes llvm-6.0
+
+# Jupyter notebook.
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+# Deep learning frameworks
+RUN pip3 install mxnet tensorflow keras
+
+# Build TVM
+COPY install/install_tvm_gpu.sh /install/install_tvm_gpu.sh
+RUN bash /install/install_tvm_gpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
+ENV PATH=/usr/local/nvidia/bin:${PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000000..e9b8b503062f
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,87 @@
+# TVM Docker
+
+This directory contains TVM's docker infrastructure.
+We use docker to provide build environments for CI and images for demos.
+We need [docker](https://docs.docker.com/engine/installation/) and
+[nvidia-docker](https://github.com/NVIDIA/nvidia-docker/) for GPU images.
+
+## Start Docker Bash Session
+
+You can use the following helper script to start an
+interactive bash session with a given image_name.
+
+```bash
+/path/to/tvm/docker/bash.sh image_name
+```
+
+The script does the following things:
+- Mount current directory to /workspace and set it as home
+- Switch user to be the same user that calls the bash.sh
+- Use the host-side network
+
+The helper bash script can be useful for building demo sessions.
+
+## Prebuilt Docker Images
+
+We provide several pre-built images for doing quick exploration with TVM installed.
+For example, you can run the following command to get the ```tvmai/demo-cpu``` image.
+
+```bash
+/path/to/tvm/docker/bash.sh tvmai/demo-cpu
+```
+
+Then inside the docker container, you can type the following command to start the jupyter notebook
+```bash
+jupyter notebook
+```
+
+Check out https://hub.docker.com/r/tvmai/ to get the full list of available prebuilt images.
+
+
+## Use Local Build Script
+
+We also provide scripts to build docker images locally.
+We use [`build.sh`](./build.sh) to build the images and run commands in them.
+To build and run docker images, we can run the following command
+at the root of the project.
+
+```bash
+./docker/build.sh image_name [command]
+```
+
+Here image_name corresponds to the docker image defined in
+```Dockerfile.image_name```.
+
+You can also start an interactive session by typing
+
+```bash
+./docker/build.sh image_name -it bash
+```
+
+The build command will map the tvm root to /workspace/ inside the container,
+with the same user as the user invoking the docker command.
+Here are some common examples of performing CI tasks.
+
+- lint the python code
+
+ ```bash
+ ./docker/build.sh ci_lint make pylint
+ ```
+
+- build the code with CUDA support
+
+ ```bash
+ ./docker/build.sh ci_gpu make -j$(nproc)
+ ```
+
+- run the python unittests
+
+ ```bash
+ ./docker/build.sh ci_gpu tests/scripts/task_python_unittest.sh
+ ```
+
+- build the documentation. The results will be available at `docs/_build/html`
+
+ ```bash
+ ./docker/build.sh ci_gpu make -C docs html
+ ```
diff --git a/docker/bash.sh b/docker/bash.sh
new file mode 100755
index 000000000000..ba935d7ed089
--- /dev/null
+++ b/docker/bash.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+#
+# Start a bash, mount /workspace to be current directory.
+#
+# Usage: docker/bash.sh <CONTAINER_NAME>
+# Starts an interactive session
+#
+# Usage2: docker/bash.sh <CONTAINER_NAME> [COMMAND]
+# Execute command in the docker image, non-interactive
+#
+if [ "$#" -lt 1 ]; then
+ echo "Usage: docker/bash.sh [COMMAND]"
+ exit -1
+fi
+
+DOCKER_IMAGE_NAME=("$1")
+
+if [ "$#" -eq 1 ]; then
+ COMMAND="bash"
+ CI_DOCKER_EXTRA_PARAMS=("-it" "--net=host")
+else
+ shift 1
+ COMMAND=("$@")
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+WORKSPACE="$(pwd)"
+
+# Use nvidia-docker if the container is GPU.
+if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* ]]; then
+ DOCKER_BINARY="nvidia-docker"
+else
+ DOCKER_BINARY="docker"
+fi
+
+# Print arguments.
+echo "WORKSPACE: ${WORKSPACE}"
+echo "DOCKER CONTAINER NAME: ${DOCKER_IMAGE_NAME}"
+echo ""
+
+echo "Running '${COMMAND[@]}' inside ${DOCKER_IMAGE_NAME}..."
+
+# By default we cleanup - remove the container once it finish running (--rm)
+# and share the PID namespace (--pid=host) so the process inside does not have
+# pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it).
+echo ${DOCKER_BINARY}
+${DOCKER_BINARY} run --rm --pid=host\
+ -v ${WORKSPACE}:/workspace \
+ -v ${SCRIPT_DIR}:/docker \
+ -w /workspace \
+ -e "CI_BUILD_HOME=/workspace" \
+ -e "CI_BUILD_USER=$(id -u -n)" \
+ -e "CI_BUILD_UID=$(id -u)" \
+ -e "CI_BUILD_GROUP=$(id -g -n)" \
+ -e "CI_BUILD_GID=$(id -g)" \
+ ${CI_DOCKER_EXTRA_PARAMS[@]} \
+ ${DOCKER_IMAGE_NAME}\
+ bash /docker/with_the_same_user \
+ ${COMMAND[@]}
diff --git a/tests/ci_build/ci_build.sh b/docker/build.sh
similarity index 93%
rename from tests/ci_build/ci_build.sh
rename to docker/build.sh
index 86c138aaf3a5..1d476e52e642 100755
--- a/tests/ci_build/ci_build.sh
+++ b/docker/build.sh
@@ -2,7 +2,7 @@
#
# Execute command within a docker container
#
-# Usage: ci_build.sh <CONTAINER_TYPE> [--dockerfile <DOCKERFILE_PATH>] [-it] <COMMAND>
+# Usage: build.sh <CONTAINER_TYPE> [--dockerfile <DOCKERFILE_PATH>] [-it] <COMMAND>
#
#
# CONTAINER_TYPE: Type of the docker container used to run the build: e.g.,
@@ -37,6 +37,11 @@ if [[ "$1" == "-it" ]]; then
shift 1
fi
+if [[ "$1" == "--net=host" ]]; then
+ CI_DOCKER_EXTRA_PARAMS+=('--net=host')
+ shift 1
+fi
+
if [[ ! -f "${DOCKERFILE_PATH}" ]]; then
echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\""
exit 1
@@ -71,8 +76,8 @@ function upsearch () {
# Set up WORKSPACE and BUILD_TAG. Jenkins will set them for you or we pick
# reasonable defaults if you run it outside of Jenkins.
-WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../../}"
-BUILD_TAG="${BUILD_TAG:-tvm-ci}"
+WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../}"
+BUILD_TAG="${BUILD_TAG:-tvm}"
# Determine the docker image name
DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}"
@@ -122,5 +127,5 @@ ${DOCKER_BINARY} run --rm --pid=host \
-e "CI_BUILD_GID=$(id -g)" \
${CI_DOCKER_EXTRA_PARAMS[@]} \
${DOCKER_IMG_NAME} \
- bash tests/ci_build/with_the_same_user \
+ bash docker/with_the_same_user \
${COMMAND[@]}
diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh
new file mode 100644
index 000000000000..51593e66506e
--- /dev/null
+++ b/docker/install/install_tvm_cpu.sh
@@ -0,0 +1,12 @@
+cd /usr
+git clone https://github.com/dmlc/tvm --recursive
+cd /usr/tvm
+echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
+echo set\(USE_BLAS openblas\) >> config.cmake
+mkdir -p build
+cd build
+cmake ..
+make -j10
diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh
new file mode 100644
index 000000000000..8a1324646fd5
--- /dev/null
+++ b/docker/install/install_tvm_gpu.sh
@@ -0,0 +1,14 @@
+cd /usr
+git clone https://github.com/dmlc/tvm --recursive
+cd /usr/tvm
+echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake
+echo set\(USE_CUDA ON\) >> config.cmake
+echo set\(USE_CUDNN ON\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
+echo set\(USE_BLAS openblas\) >> config.cmake
+mkdir -p build
+cd build
+cmake ..
+make -j10
diff --git a/tests/ci_build/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh
similarity index 76%
rename from tests/ci_build/install/ubuntu_install_core.sh
rename to docker/install/ubuntu_install_core.sh
index 9823ae0788ac..efc69c946b97 100644
--- a/tests/ci_build/install/ubuntu_install_core.sh
+++ b/docker/install/ubuntu_install_core.sh
@@ -1,5 +1,5 @@
# install libraries for building c++ core on ubuntu
-apt-get install -y --no-install-recommends --force-yes \
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
git make libgtest-dev cmake wget unzip libtinfo-dev libz-dev\
libcurl4-openssl-dev libopenblas-dev g++ sudo
diff --git a/docker/install/ubuntu_install_coreml.sh b/docker/install/ubuntu_install_coreml.sh
new file mode 100644
index 000000000000..4b0fd126c61d
--- /dev/null
+++ b/docker/install/ubuntu_install_coreml.sh
@@ -0,0 +1 @@
+pip3 install coremltools
diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh
new file mode 100644
index 000000000000..f5e0c2791d80
--- /dev/null
+++ b/docker/install/ubuntu_install_darknet.sh
@@ -0,0 +1,4 @@
+# install the necessary dependencies: cffi, opencv
+wget 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so
+pip2 install opencv-python cffi
+pip3 install opencv-python cffi
diff --git a/tests/ci_build/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_emscripten.sh
rename to docker/install/ubuntu_install_emscripten.sh
diff --git a/tests/ci_build/install/ubuntu_install_iverilog.sh b/docker/install/ubuntu_install_iverilog.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_iverilog.sh
rename to docker/install/ubuntu_install_iverilog.sh
diff --git a/tests/ci_build/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_java.sh
rename to docker/install/ubuntu_install_java.sh
diff --git a/docker/install/ubuntu_install_keras.sh b/docker/install/ubuntu_install_keras.sh
new file mode 100644
index 000000000000..33bc38c80972
--- /dev/null
+++ b/docker/install/ubuntu_install_keras.sh
@@ -0,0 +1,2 @@
+pip2 install keras tensorflow h5py
+pip3 install keras tensorflow h5py
diff --git a/tests/ci_build/install/ubuntu_install_llvm.sh b/docker/install/ubuntu_install_llvm.sh
similarity index 76%
rename from tests/ci_build/install/ubuntu_install_llvm.sh
rename to docker/install/ubuntu_install_llvm.sh
index e5b28b911f61..16d0fe150b7e 100644
--- a/tests/ci_build/install/ubuntu_install_llvm.sh
+++ b/docker/install/ubuntu_install_llvm.sh
@@ -8,10 +8,15 @@ echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
>> /etc/apt/sources.list.d/llvm.list
+echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main\
+ >> /etc/apt/sources.list.d/llvm.list
+echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main\
+ >> /etc/apt/sources.list.d/llvm.list
+
echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\
>> /etc/apt/sources.list.d/llvm.list
echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\
>> /etc/apt/sources.list.d/llvm.list
wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
-apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0
+apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0
diff --git a/docker/install/ubuntu_install_mxnet.sh b/docker/install/ubuntu_install_mxnet.sh
new file mode 100644
index 000000000000..0e7e9e3939a8
--- /dev/null
+++ b/docker/install/ubuntu_install_mxnet.sh
@@ -0,0 +1 @@
+pip3 install mxnet
diff --git a/tests/ci_build/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_nodejs.sh
rename to docker/install/ubuntu_install_nodejs.sh
diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh
new file mode 100644
index 000000000000..517ea77ab81e
--- /dev/null
+++ b/docker/install/ubuntu_install_onnx.sh
@@ -0,0 +1,8 @@
+# pin to a minimum version for now; quote the spec so the shell does not
+# treat >= as a redirection
+pip2 install 'onnx>=1.1.0'
+pip3 install 'onnx>=1.1.0'
+
+pip2 install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp27-cp27mu-manylinux1_x86_64.whl
+pip2 install torchvision
+pip3 install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp35-cp35m-manylinux1_x86_64.whl
+pip3 install torchvision
diff --git a/tests/ci_build/install/ubuntu_install_opencl.sh b/docker/install/ubuntu_install_opencl.sh
similarity index 68%
rename from tests/ci_build/install/ubuntu_install_opencl.sh
rename to docker/install/ubuntu_install_opencl.sh
index 636236539a98..ca4d1d04fd5c 100644
--- a/tests/ci_build/install/ubuntu_install_opencl.sh
+++ b/docker/install/ubuntu_install_opencl.sh
@@ -1,8 +1,8 @@
# Install OpenCL runtime in nvidia docker.
-apt-get install -y --no-install-recommends --force-yes \
- ocl-icd-libopencl1 \
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
+ ocl-icd-opencl-dev \
clinfo && \
- rm -rf /var/lib/apt/lists/*
+ rm -rf /var/lib/apt/lists/*
mkdir -p /etc/OpenCL/vendors && \
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
diff --git a/docker/install/ubuntu_install_opengl.sh b/docker/install/ubuntu_install_opengl.sh
new file mode 100644
index 000000000000..f8be6e351581
--- /dev/null
+++ b/docker/install/ubuntu_install_opengl.sh
@@ -0,0 +1,4 @@
+apt-get update --fix-missing
+
+apt-get install -y --no-install-recommends --force-yes \
+ libgl1-mesa-dev libglfw3-dev
\ No newline at end of file
diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
new file mode 100644
index 000000000000..a34019e1003e
--- /dev/null
+++ b/docker/install/ubuntu_install_python.sh
@@ -0,0 +1,12 @@
+# install python and pip, don't modify this, modify install_python_package.sh
+apt-get update && apt-get install -y python-dev
+
+# python 3.6
+apt-get update && yes | apt-get install software-properties-common
+add-apt-repository ppa:jonathonf/python-3.6 &&\
+ apt-get update && apt-get install -y python-pip python-dev python3.6 python3.6-dev
+
+rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3
+
+# Install pip
+cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py
diff --git a/tests/ci_build/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
similarity index 81%
rename from tests/ci_build/install/ubuntu_install_python_package.sh
rename to docker/install/ubuntu_install_python_package.sh
index fbed2e1904cd..3e5c88674079 100644
--- a/tests/ci_build/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -1,3 +1,3 @@
# install libraries for python package on ubuntu
pip2 install nose pylint numpy nose-timer cython decorator scipy tornado
-pip3 install nose pylint numpy nose-timer cython decorator scipy tornado
+pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset
diff --git a/docker/install/ubuntu_install_redis.sh b/docker/install/ubuntu_install_redis.sh
new file mode 100644
index 000000000000..dfc9a3c381b6
--- /dev/null
+++ b/docker/install/ubuntu_install_redis.sh
@@ -0,0 +1,3 @@
+apt-get update && apt-get install -y redis-server
+pip2 install xgboost psutil
+pip3 install xgboost psutil
diff --git a/tests/ci_build/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_rocm.sh
rename to docker/install/ubuntu_install_rocm.sh
diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh
new file mode 100644
index 000000000000..ba04c2e25e6f
--- /dev/null
+++ b/docker/install/ubuntu_install_sphinx.sh
@@ -0,0 +1 @@
+pip3 install sphinx sphinx-gallery sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image 'commonmark>=0.7.3' 'docutils>=0.11'
diff --git a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh
new file mode 100644
index 000000000000..a4155da49651
--- /dev/null
+++ b/docker/install/ubuntu_install_vulkan.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+wget https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run
+
+bash vulkansdk-linux-x86_64-1.0.65.0.run
+mv VulkanSDK /usr/local/VulkanSDK
+cd /usr/local/VulkanSDK/1.0.65.0
+./build_tools.sh
+./build_samples.sh
diff --git a/tests/ci_build/with_the_same_user b/docker/with_the_same_user
similarity index 91%
rename from tests/ci_build/with_the_same_user
rename to docker/with_the_same_user
index 1e6ab883694b..470d64384de6 100644
--- a/tests/ci_build/with_the_same_user
+++ b/docker/with_the_same_user
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
# This script is a wrapper creating the same user inside container as the one
-# running the ci_build.sh outside the container. It also set the home directory
+# running the docker/build.sh outside the container. It also sets the home directory
# for the user inside container to match the same absolute path as the workspace
# outside of container. Do not run this manually. It does not make sense. It is
# intended to be called by ci_build.sh only.
@@ -30,5 +30,6 @@ HOME=${CI_BUILD_HOME}\
sudo -u "#${CI_BUILD_UID}" --preserve-env\
PATH=${PATH}\
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\
+ PYTHONPATH=${PYTHONPATH}\
HOME=${CI_BUILD_HOME}\
${COMMAND[@]}
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 87e7da7043e5..7bb47ccab4c5 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -753,7 +753,7 @@ WARN_LOGFILE =
# spaces.
# Note: If this tag is empty the current directory is searched.
-INPUT = include/tvm topi/include/topi
+INPUT = include/tvm topi/include/topi nnvm/include/nnvm vta/include/vta
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1934,7 +1934,7 @@ ENABLE_PREPROCESSING = YES
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-MACRO_EXPANSION = NO
+MACRO_EXPANSION = YES
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
# the macro expansion is limited to the macros specified with the PREDEFINED and
diff --git a/docs/Makefile b/docs/Makefile
index 1e45fb5e3787..d7a12839ba3d 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -3,15 +3,10 @@
# You can set these variables from the command line.
SPHINXOPTS =
-SPHINXBUILD = sphinx-build
+SPHINXBUILD = python3 -m sphinx
PAPER =
BUILDDIR = _build
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
@@ -51,6 +46,8 @@ help:
clean:
rm -rf $(BUILDDIR)/*
rm -rf gen_modules
+ rm -rf tutorials
+ rm -rf vta/tutorials
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
diff --git a/docs/README.txt b/docs/README.txt
index b8780dd9fc87..fffdaa233ef8 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -1,6 +1,28 @@
-The documentation of tvm is generated with recommonmark and sphinx.
+TVM Documentation
+=================
+This folder contains the source of TVM's documents.
-- A hosted version of doc is at http://docs.tvmlang.org
-- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
+- A hosted version of doc is at http://docs.tvm.ai
+- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark Pillow
- Build tvm first in the root folder.
- To build locally, you need to enable USE_CUDA, USE_OPENCL, LLVM_CONFIG in config.mk and then type "make html" in this folder.
+
+Only Execute Specified Tutorials
+--------------------------------
+The document build process will execute all the tutorials in the sphinx gallery.
+This will cause failures in some cases when certain machines do not have the
+necessary environment. You can set ```TVM_TUTORIAL_EXEC_PATTERN``` to only execute
+the paths that match the regular expression pattern.
+
+For example, to only build tutorials under /vta/tutorials, run
+
+```bash
+TVM_TUTORIAL_EXEC_PATTERN=/vta/tutorials make html
+```
+
+To only build one specific file, do
+
+```bash
+# The backslash \ is used to escape . in the regular expression
+TVM_TUTORIAL_EXEC_PATTERN=file_name\.py make html
+```
diff --git a/docs/_static/css/tvm_theme.css b/docs/_static/css/tvm_theme.css
index 5e0838abf6cb..274589887b3f 100644
--- a/docs/_static/css/tvm_theme.css
+++ b/docs/_static/css/tvm_theme.css
@@ -9,3 +9,13 @@
nav .hidden-section {
display: inherit;
}
+
+.wy-side-nav-search {
+ background-color: #fff;
+ color: #333;
+}
+
+.version{
+ color: #404040 !important;
+}
+
diff --git a/docs/_static/img/README b/docs/_static/img/README
new file mode 100644
index 000000000000..414328cc729d
--- /dev/null
+++ b/docs/_static/img/README
@@ -0,0 +1,2 @@
+The logo file in this repo is an exception due to the needs of sphinx.
+By default we avoid putting large binary blobs into this repo.
\ No newline at end of file
diff --git a/docs/_static/img/tvm-logo-small.png b/docs/_static/img/tvm-logo-small.png
new file mode 100644
index 000000000000..c3519fece55b
Binary files /dev/null and b/docs/_static/img/tvm-logo-small.png differ
diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst
new file mode 100644
index 000000000000..0a2ae40f24a9
--- /dev/null
+++ b/docs/api/python/autotvm.rst
@@ -0,0 +1,73 @@
+tvm.autotvm
+-----------
+.. automodule:: tvm.autotvm
+
+tvm.autotvm.measure
+~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.measure.measure
+
+.. autoclass:: tvm.autotvm.measure.MeasureInput
+ :members:
+
+.. autoclass:: tvm.autotvm.measure.MeasureResult
+ :members:
+
+.. autofunction:: tvm.autotvm.measure.measure_option
+
+.. autofunction:: tvm.autotvm.measure.create_measure_batch
+
+
+tvm.autotvm.tuner
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.tuner
+ :members:
+
+.. autoclass:: tvm.autotvm.tuner.Tuner
+ :members:
+
+.. autoclass:: tvm.autotvm.tuner.RandomTuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.GridSearchTuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.GATuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.XGBTuner
+ :members:
+ :inherited-members:
+
+.. automodule:: tvm.autotvm.tuner.callback
+ :members:
+
+.. automodule:: tvm.autotvm.tuner.graph_tuning
+ :members:
+
+tvm.autotvm.task
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.task
+ :members:
+
+.. automodule:: tvm.autotvm.task.task
+ :members:
+
+.. automodule:: tvm.autotvm.task.space
+ :members:
+
+.. automodule:: tvm.autotvm.task.dispatcher
+ :members:
+
+.. automodule:: tvm.autotvm.task.topi_integration
+ :members:
+
+.. automodule:: tvm.autotvm.task.nnvm_integration
+ :members:
+
+tvm.autotvm.record
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.record
+ :members:
diff --git a/docs/api/python/bridge.rst b/docs/api/python/bridge.rst
new file mode 100644
index 000000000000..afc7dc298652
--- /dev/null
+++ b/docs/api/python/bridge.rst
@@ -0,0 +1,7 @@
+Framework Bridge APIs
+---------------------
+
+tvm.contrib.mxnet
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.mxnet
+ :members:
diff --git a/docs/api/python/contrib.rst b/docs/api/python/contrib.rst
index ed04230deb8d..a58a3aa4fbef 100644
--- a/docs/api/python/contrib.rst
+++ b/docs/api/python/contrib.rst
@@ -1,38 +1,103 @@
-Contrib APIs
-------------
+Additional Contrib APIs
+-----------------------
.. automodule:: tvm.contrib
-tvm.contrib.nvcc
-~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.nvcc
+tvm.contrib.cblas
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.cblas
+ :members:
+
+
+tvm.contrib.clang
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.clang
:members:
+
tvm.contrib.cc
~~~~~~~~~~~~~~
.. automodule:: tvm.contrib.cc
:members:
-tvm.contrib.xcode
-~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.xcode
+
+tvm.contrib.cublas
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.cublas
+ :members:
+
+
+tvm.contrib.emscripten
+~~~~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.emscripten
:members:
-tvm.contrib.rpc
+tvm.contrib.miopen
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.miopen
+ :members:
+
+tvm.contrib.ndk
~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.rpc
+.. automodule:: tvm.contrib.ndk
+ :members:
+
+
+tvm.contrib.nnpack
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.nnpack
+ :members:
+
+
+tvm.contrib.nvcc
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.nvcc
+ :members:
+
+
+tvm.contrib.pickle_memoize
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.pickle_memoize
+ :members:
+
+
+tvm.contrib.random
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.random
:members:
-tvm.contrib.graph_runtime
-~~~~~~~~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.graph_runtime
+
+tvm.contrib.rocblas
+~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.rocblas
:members:
+
+tvm.contrib.rocm
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.rocm
+ :members:
+
+
+tvm.contrib.spirv
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.spirv
+ :members:
+
+
+tvm.contrib.tar
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.tar
+ :members:
+
+
tvm.contrib.util
~~~~~~~~~~~~~~~~
.. automodule:: tvm.contrib.util
:members:
-tvm.contrib.cblas
+
+
+tvm.contrib.xcode
~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.cblas
+.. automodule:: tvm.contrib.xcode
:members:
diff --git a/docs/api/python/graph_runtime.rst b/docs/api/python/graph_runtime.rst
new file mode 100644
index 000000000000..89a223323e14
--- /dev/null
+++ b/docs/api/python/graph_runtime.rst
@@ -0,0 +1,4 @@
+tvm.contrib.graph_runtime
+-------------------------
+.. automodule:: tvm.contrib.graph_runtime
+ :members:
diff --git a/docs/api/python/hybrid.rst b/docs/api/python/hybrid.rst
new file mode 100644
index 000000000000..ac4111cfe768
--- /dev/null
+++ b/docs/api/python/hybrid.rst
@@ -0,0 +1,11 @@
+tvm.hybrid
+----------
+.. automodule:: tvm.hybrid
+
+.. autosummary::
+
+ tvm.hybrid.parse
+ tvm.hybrid.script
+
+.. autofunction:: tvm.hybrid.parse
+.. autofunction:: tvm.hybrid.script
diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst
index ee830e031462..59bd1795b7ec 100644
--- a/docs/api/python/index.rst
+++ b/docs/api/python/index.rst
@@ -14,6 +14,13 @@ Python API
ndarray
container
function
+ autotvm
+ graph_runtime
+ rpc
+ bridge
contrib
dev
topi
+ vta/index
+ nnvm/index
+ hybrid
diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst
index 71ecaaa2ce8d..3942c57f1a04 100644
--- a/docs/api/python/intrin.rst
+++ b/docs/api/python/intrin.rst
@@ -10,7 +10,11 @@ tvm.intrin
tvm.register_intrin_rule
tvm.exp
tvm.log
-
+ tvm.floor
+ tvm.ceil
+ tvm.trunc
+ tvm.round
+ tvm.abs
.. autofunction:: tvm.call_packed
.. autofunction:: tvm.call_pure_intrin
@@ -18,3 +22,8 @@ tvm.intrin
.. autofunction:: tvm.register_intrin_rule
.. autofunction:: tvm.exp
.. autofunction:: tvm.log
+.. autofunction:: tvm.floor
+.. autofunction:: tvm.ceil
+.. autofunction:: tvm.trunc
+.. autofunction:: tvm.round
+.. autofunction:: tvm.abs
diff --git a/docs/api/python/ndarray.rst b/docs/api/python/ndarray.rst
index a06117e05543..2c8f0c292a43 100644
--- a/docs/api/python/ndarray.rst
+++ b/docs/api/python/ndarray.rst
@@ -14,5 +14,6 @@ tvm.ndarray
.. autofunction:: tvm.opencl
.. autofunction:: tvm.metal
.. autofunction:: tvm.ndarray.array
+.. autofunction:: tvm.ndarray.empty
.. autofunction:: tvm.register_extension
diff --git a/docs/api/python/nnvm/compiler.rst b/docs/api/python/nnvm/compiler.rst
new file mode 100644
index 000000000000..4b995b28cd9e
--- /dev/null
+++ b/docs/api/python/nnvm/compiler.rst
@@ -0,0 +1,23 @@
+nnvm.compiler
+-------------
+
+.. automodule:: nnvm.compiler
+
+.. autofunction:: nnvm.compiler.build
+
+.. autofunction:: nnvm.compiler.build_config
+
+.. autofunction:: nnvm.compiler.save_param_dict
+
+.. autofunction:: nnvm.compiler.load_param_dict
+
+.. autofunction:: nnvm.compiler.optimize
+
+.. automodule:: nnvm.compiler.graph_util
+ :members:
+
+.. automodule:: nnvm.compiler.graph_attr
+ :members:
+
+.. automodule:: nnvm.compiler.compile_engine
+ :members:
diff --git a/docs/api/python/nnvm/frontend.rst b/docs/api/python/nnvm/frontend.rst
new file mode 100644
index 000000000000..f872a6b878e2
--- /dev/null
+++ b/docs/api/python/nnvm/frontend.rst
@@ -0,0 +1,12 @@
+nnvm.frontend
+-------------
+
+.. automodule:: nnvm.frontend
+
+.. autofunction:: nnvm.frontend.from_mxnet
+
+.. autofunction:: nnvm.frontend.from_onnx
+
+.. autofunction:: nnvm.frontend.from_coreml
+
+.. autofunction:: nnvm.frontend.from_keras
diff --git a/docs/api/python/nnvm/graph.rst b/docs/api/python/nnvm/graph.rst
new file mode 100644
index 000000000000..5b36ab5194fd
--- /dev/null
+++ b/docs/api/python/nnvm/graph.rst
@@ -0,0 +1,8 @@
+nnvm.graph
+----------
+.. automodule:: nnvm.graph
+
+.. autofunction:: nnvm.graph.create
+
+.. autoclass:: nnvm.graph.Graph
+ :members:
diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst
new file mode 100644
index 000000000000..c0e5912c76be
--- /dev/null
+++ b/docs/api/python/nnvm/index.rst
@@ -0,0 +1,13 @@
+NNVM API
+========
+
+This document contains the python API to the NNVM compiler toolchain.
+
+.. toctree::
+ :maxdepth: 2
+
+ compiler
+ frontend
+ symbol
+ graph
+ top
diff --git a/docs/api/python/nnvm/symbol.rst b/docs/api/python/nnvm/symbol.rst
new file mode 100644
index 000000000000..c341d2ef71d7
--- /dev/null
+++ b/docs/api/python/nnvm/symbol.rst
@@ -0,0 +1,10 @@
+nnvm.symbol
+-----------
+.. automodule:: nnvm.symbol
+
+.. autoclass:: nnvm.symbol.Symbol
+ :members:
+
+.. autoclass:: nnvm.symbol.Variable
+
+.. autofunction:: nnvm.symbol.Group
diff --git a/docs/api/python/nnvm/top.rst b/docs/api/python/nnvm/top.rst
new file mode 100644
index 000000000000..fd28ff363f0d
--- /dev/null
+++ b/docs/api/python/nnvm/top.rst
@@ -0,0 +1,13 @@
+nnvm.top
+--------
+.. automodule:: nnvm.top
+
+.. autofunction:: register_compute
+
+.. autofunction:: register_schedule
+
+.. autofunction:: register_pattern
+
+
+.. autoclass:: nnvm.top.AttrDict
+ :members:
diff --git a/docs/api/python/rpc.rst b/docs/api/python/rpc.rst
new file mode 100644
index 000000000000..6c4ef59a493c
--- /dev/null
+++ b/docs/api/python/rpc.rst
@@ -0,0 +1,22 @@
+tvm.rpc
+-------
+.. automodule:: tvm.rpc
+
+.. autofunction:: tvm.rpc.connect
+.. autofunction:: tvm.rpc.connect_tracker
+
+.. autoclass:: tvm.rpc.TrackerSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.RPCSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.LocalSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.Server
+ :members:
+ :inherited-members:
diff --git a/docs/api/python/target.rst b/docs/api/python/target.rst
index 0f824324d0c5..e5723349b5c0 100644
--- a/docs/api/python/target.rst
+++ b/docs/api/python/target.rst
@@ -1,13 +1,4 @@
tvm.target
----------
.. automodule:: tvm.target
-
-.. autofunction:: tvm.target.generic_func
-
-.. autoclass:: tvm.target.Target
:members:
-
-.. autofunction:: tvm.target.cuda
-.. autofunction:: tvm.target.rocm
-.. autofunction:: tvm.target.rasp
-.. autofunction:: tvm.target.create
diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 4f7d8cace31b..7f150ddbf7cd 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -1,24 +1,45 @@
-TVM Operator Inventory
-----------------------
+TOPI
+----
.. automodule:: topi
-Index
-~~~~~
-
-**List of operators**
+List of operators
+~~~~~~~~~~~~~~~~~
.. autosummary::
+ topi.identity
+ topi.negative
+ topi.floor
+ topi.ceil
+ topi.trunc
+ topi.round
+ topi.abs
topi.exp
topi.tanh
topi.log
topi.sqrt
topi.sigmoid
+ topi.clip
+ topi.cast
topi.transpose
+ topi.flip
+ topi.strided_slice
topi.expand_dims
+ topi.reshape
+ topi.squeeze
+ topi.concatenate
+ topi.split
+ topi.take
+ topi.full
+ topi.full_like
topi.nn.relu
topi.nn.leaky_relu
topi.nn.dilate
+ topi.nn.pool
+ topi.nn.global_pool
+ topi.nn.upsampling
+ topi.nn.softmax
+ topi.nn.log_softmax
topi.nn.conv2d_nchw
topi.nn.conv2d_hwcn
topi.nn.depthwise_conv2d_nchw
@@ -26,15 +47,28 @@ Index
topi.max
topi.sum
topi.min
+ topi.argmax
+ topi.argmin
topi.broadcast_to
- topi.broadcast_add
- topi.broadcast_sub
- topi.broadcast_mul
- topi.broadcast_div
-
+ topi.add
+ topi.subtract
+ topi.multiply
+ topi.divide
+ topi.mod
+ topi.maximum
+ topi.minimum
+ topi.power
+ topi.greater
+ topi.less
+ topi.equal
+ topi.not_equal
+ topi.greater_equal
+ topi.less_equal
+ topi.image.resize
-**List of schedules**
+List of schedules
+~~~~~~~~~~~~~~~~~
.. autosummary::
topi.generic.schedule_conv2d_nchw
@@ -45,33 +79,65 @@ Index
topi
~~~~
+.. autofunction:: topi.negative
+.. autofunction:: topi.identity
+.. autofunction:: topi.floor
+.. autofunction:: topi.ceil
+.. autofunction:: topi.trunc
+.. autofunction:: topi.round
+.. autofunction:: topi.abs
.. autofunction:: topi.exp
.. autofunction:: topi.tanh
.. autofunction:: topi.log
.. autofunction:: topi.sqrt
.. autofunction:: topi.sigmoid
+.. autofunction:: topi.clip
+.. autofunction:: topi.cast
.. autofunction:: topi.transpose
+.. autofunction:: topi.flip
+.. autofunction:: topi.strided_slice
.. autofunction:: topi.expand_dims
+.. autofunction:: topi.reshape
+.. autofunction:: topi.squeeze
+.. autofunction:: topi.concatenate
+.. autofunction:: topi.split
+.. autofunction:: topi.take
+.. autofunction:: topi.full
+.. autofunction:: topi.full_like
.. autofunction:: topi.max
.. autofunction:: topi.sum
.. autofunction:: topi.min
.. autofunction:: topi.broadcast_to
-.. autofunction:: topi.broadcast_add
-.. autofunction:: topi.broadcast_sub
-.. autofunction:: topi.broadcast_mul
-.. autofunction:: topi.broadcast_div
-
+.. autofunction:: topi.add
+.. autofunction:: topi.subtract
+.. autofunction:: topi.multiply
+.. autofunction:: topi.divide
+.. autofunction:: topi.mod
+.. autofunction:: topi.maximum
+.. autofunction:: topi.minimum
+.. autofunction:: topi.power
+.. autofunction:: topi.greater
+.. autofunction:: topi.less
topi.nn
~~~~~~~
.. autofunction:: topi.nn.relu
.. autofunction:: topi.nn.leaky_relu
.. autofunction:: topi.nn.dilate
+.. autofunction:: topi.nn.pool
+.. autofunction:: topi.nn.global_pool
+.. autofunction:: topi.nn.upsampling
+.. autofunction:: topi.nn.softmax
+.. autofunction:: topi.nn.log_softmax
.. autofunction:: topi.nn.conv2d_nchw
.. autofunction:: topi.nn.conv2d_hwcn
.. autofunction:: topi.nn.depthwise_conv2d_nchw
.. autofunction:: topi.nn.depthwise_conv2d_nhwc
+topi.image
+~~~~~~~~~~
+.. autofunction:: topi.image.resize
+
topi.generic
~~~~~~~~~~~~
diff --git a/docs/api/python/tvm.rst b/docs/api/python/tvm.rst
index 8700da38273b..6522df3ae9d3 100644
--- a/docs/api/python/tvm.rst
+++ b/docs/api/python/tvm.rst
@@ -15,6 +15,7 @@ The user facing API for computation declaration.
tvm.extern
tvm.decl_buffer
tvm.reduce_axis
+ tvm.select
tvm.thread_axis
tvm.comm_reducer
tvm.sum
@@ -33,6 +34,7 @@ The user facing API for computation declaration.
.. autofunction:: tvm.extern
.. autofunction:: tvm.decl_buffer
.. autofunction:: tvm.reduce_axis
+.. autofunction:: tvm.select
.. autofunction:: tvm.thread_axis
.. autofunction:: tvm.comm_reducer
.. autofunction:: tvm.sum
diff --git a/docs/api/python/vta/index.rst b/docs/api/python/vta/index.rst
new file mode 100644
index 000000000000..014b789e5aa0
--- /dev/null
+++ b/docs/api/python/vta/index.rst
@@ -0,0 +1,28 @@
+VTA API
+=======
+
+This document contains the python API to the VTA compiler toolchain.
+
+.. automodule:: vta
+
+Hardware Information
+--------------------
+
+.. autofunction:: vta.Environment
+.. autofunction:: vta.get_env
+
+RPC Utilities
+-------------
+
+.. autofunction:: vta.reconfig_runtime
+.. autofunction:: vta.program_fpga
+
+
+Compiler API
+------------
+We program VTA using TVM, so the compiler API in the vta package
+is only a thin wrapper that provides VTA specific extensions.
+
+.. autofunction:: vta.build_config
+.. autofunction:: vta.build
+.. autofunction:: vta.lower
diff --git a/docs/api_links.rst b/docs/api_links.rst
index 9a55af1728b9..909cfe367f29 100644
--- a/docs/api_links.rst
+++ b/docs/api_links.rst
@@ -1,5 +1,5 @@
-Links to API References
-=======================
+Links to C++ and JS API References
+==================================
This page contains links to API references that are built with different doc build systems.
diff --git a/docs/conf.py b/docs/conf.py
index 4a42fb0fedb0..989d26f87d3e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,6 +25,8 @@
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
sys.path.insert(0, os.path.join(curr_path, '../python/'))
sys.path.insert(0, os.path.join(curr_path, '../topi/python'))
+sys.path.insert(0, os.path.join(curr_path, '../nnvm/python'))
+sys.path.insert(0, os.path.join(curr_path, '../vta/python'))
# -- General configuration ------------------------------------------------
@@ -40,6 +42,7 @@
'.md': CommonMarkParser
}
os.environ['TVM_BUILD_DOC'] = '1'
+os.environ['NNVM_BUILD_DOC'] = '1'
# Version information.
import tvm
version = tvm.__version__
@@ -137,6 +140,14 @@
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
+html_theme_options = {
+ 'analytics_id': 'UA-75982049-2',
+ 'logo_only': True,
+}
+
+html_logo = "_static/img/tvm-logo-small.png"
+
+
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
@@ -174,12 +185,17 @@ def run_doxygen(folder):
from sphinx_gallery.sorting import ExplicitOrder
-examples_dirs = ['../tutorials/']
-gallery_dirs = ['tutorials']
+examples_dirs = ["../tutorials/", "../vta/tutorials/"]
+gallery_dirs = ["tutorials", "vta/tutorials"]
+
subsection_order = ExplicitOrder(
['../tutorials/language',
'../tutorials/optimize',
- '../tutorials/deployment'])
+ '../tutorials/autotvm',
+ '../tutorials/vta',
+ '../tutorials/topi',
+ '../tutorials/deployment',
+ '../tutorials/nnvm'])
def generate_doxygen_xml(app):
"""Run the doxygen make commands if we're on the ReadTheDocs server"""
@@ -207,7 +223,7 @@ def setup(app):
'examples_dirs': examples_dirs,
'gallery_dirs': gallery_dirs,
'subsection_order': subsection_order,
+ 'filename_pattern': os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"),
'find_mayavi_figures': False,
- 'filename_pattern': '.py',
'expected_failing_examples': []
}
diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst
new file mode 100644
index 000000000000..dc7d998ca37f
--- /dev/null
+++ b/docs/contribute/code_guide.rst
@@ -0,0 +1,39 @@
+.. _code_guide:
+
+Code Guide and Tips
+===================
+
+This is a document used to record tips in tvm codebase for reviewers and contributors.
+Most of them are lessons summarized during the contributing and review process.
+
+
+C++ Code Styles
+---------------
+- Use the Google C/C++ style.
+- The public facing functions are documented in doxygen format.
+- Favor concrete type declaration over ``auto`` as long as it is short.
+- Favor passing by const reference (e.g. ``const Expr&``) over passing by value,
+  except when the function consumes the value by copy constructor or move;
+  in such cases passing by value is better than passing by const reference.
+  See the sketch below.
+
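+A minimal sketch of the pass-by-value rule above (the class and its members
+are hypothetical):
+
+.. code:: c++
+
+   #include <string>
+   #include <utility>
+
+   class Node {
+    public:
+     // consumes the value: pass by value, then move it into place
+     void SetName(std::string name) { name_ = std::move(name); }
+     // only reads the value: pass by const reference
+     bool MatchesName(const std::string& other) const { return name_ == other; }
+
+    private:
+     std::string name_;
+   };
+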
+Python Code Styles
+------------------
+- The functions and classes are documented in `numpydoc <https://numpydoc.readthedocs.io/en/latest/>`_ format; see the sketch below.
+- Check your code style using ``make pylint``
+
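+A minimal numpydoc-style docstring sketch (the function itself is hypothetical):
+
+.. code:: python
+
+    def scale(data, factor):
+        """Scale the input tensor by a constant factor.
+
+        Parameters
+        ----------
+        data : tvm.Tensor
+            The input tensor.
+        factor : float
+            The scaling factor.
+
+        Returns
+        -------
+        output : tvm.Tensor
+            The scaled tensor.
+        """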
+
+Handle Integer Constant Expression
+----------------------------------
+We often need to handle constant integer expressions in tvm. Before we do so, the first question to ask is whether it is really necessary to get a constant integer. If a symbolic expression also works and lets the logic flow, we should use the symbolic expression as much as possible, so that the generated code works for shapes that are not known ahead of time.
+
+Note that in some cases we cannot know certain information, e.g. the sign of a symbolic variable. It is ok to make assumptions in such cases, while adding precise support when the variable is constant.
+
+If we do have to get constant integer expression, we should get the constant value using type ``int64_t`` instead of ``int``, to avoid potential integer overflow. We can always reconstruct an integer with the corresponding expression type via ``make_const``. The following code gives an example.
+
+.. code:: c++
+
+ Expr CalculateExpr(Expr value) {
+ int64_t int_value = GetConstInt(value);
+ int_value = CalculateExprInInt64(int_value);
+ return make_const(value.type(), int_value);
+ }
diff --git a/docs/contribute/code_review.rst b/docs/contribute/code_review.rst
new file mode 100644
index 000000000000..344296932703
--- /dev/null
+++ b/docs/contribute/code_review.rst
@@ -0,0 +1,63 @@
+Perform Code Reviews
+====================
+
+This is a general guideline for code reviewers. First of all, while it is great to add new features to a project, we must also be aware that each line of code we introduce also brings **technical debt** that we may have to eventually pay.
+
+Open source code is maintained by a community with diverse backgrounds, so it is even more important to bring clear, documented and maintainable code. Code reviews are a shepherding process to spot potential problems and improve the quality of the code. We should, however, not rely on the code review process to get the code into a ready state. Contributors are encouraged to polish the code to a ready state before requesting reviews. This is especially expected for code owner and committer candidates.
+
+Here are some checklists for code reviews; they are also a helpful reference for contributors.
+
+
+Hold the Highest Standard
+-------------------------
+The first rule for code reviewers is to always keep the highest standard, and not approve code just to "be friendly". Good, informative critiques help us learn from each other and prevent technical debt at an early stage.
+
+Ensure Test Coverage
+--------------------
+Each new feature change should introduce test cases; bug fixes should include regression tests that prevent the problem from happening again.
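+
+As an illustration, here is a minimal sketch of what such a test can look like,
+written against the python APIs used elsewhere in these docs; the computation
+and the tolerance are placeholders for whatever the patch actually touches:
+
+.. code:: python
+
+    import numpy as np
+    import tvm
+
+    def test_vector_add():
+        """Check the generated kernel against a numpy reference."""
+        n = 16
+        A = tvm.placeholder((n,), name="A")
+        B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B")
+        s = tvm.create_schedule(B.op)
+        f = tvm.build(s, [A, B], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=n).astype("float32"))
+        b = tvm.nd.array(np.zeros(n, dtype="float32"))
+        f(a, b)
+        np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)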
+
+Documentations are Mandatory
+----------------------------
+Documentation is a place we usually overlook; new functions or changes to a function should be directly reflected in the documentation. A new feature is meaningless without documentation to make it accessible. See more at :ref:`doc_guide`.
+
+Deliberate on User-facing API
+-----------------------------
+A good, minimal and stable API is critical to the project's life. A good API makes a huge difference. Always think very carefully about all the aspects, including naming, argument definitions and behavior. One good rule to check is to be consistent with the APIs of existing well-known packages if the features overlap. For example, tensor operation APIs should always be consistent with numpy.
+
+Minimum Dependency
+------------------
+Always be cautious in introducing dependencies. While it is important to reuse code and not reinvent the wheel, dependencies can increase the deployment burden on users. A good design principle is to depend on a part only when a user actually uses it.
+
+Ensure Readability
+------------------
+While it is hard to implement a new feature, it is even harder to make others understand and maintain the code you wrote. It is common for a PMC member or committer to not be able to understand certain contributions. In such cases, a reviewer should say "I don't understand" and ask the contributor to clarify. We highly encourage code comments which explain the code logic along with the code.
+
+Concise Implementation
+----------------------
+Some basic principles apply here: favor vectorized array code over loops, and check whether an existing API already solves the problem.
+
+Document Lessons in Code Reviews
+--------------------------------
+When you find there are some common lessons that can be summarized in the guideline,
+add it to the :ref:`code_guide`.
+It is always good to refer to the guideline document when requesting changes,
+so the lessons can be shared to all the community.
+
+Respect each other
+------------------
+Code reviewers and contributors are paying the most precious currency in the world -- time. We are volunteers in the community who spend this time to build good code, help each other, learn and have fun hacking.
+
+Learn from other Code Reviews
+-----------------------------
+There can be multiple reviewers reviewing the same changes, and in many cases the
+other reviewers may spot things you did not find. Try to learn from other code
+reviews and, when possible, document these lessons.
+
+Approve and Request Changes Explicitly
+--------------------------------------
+The contributor and code owner can request code reviews from multiple reviewers.
+Remember to approve changes once your comments have been addressed in a code review.
+To do so, click on the changes tab in the pull request, then select approve,
+or comment on the code and click request changes.
+The code owner can decide case by case whether the code can be merged when some of
+the reviewers did not respond in time (e.g. within a week) and the existing reviews
+are sufficient.
diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
new file mode 100644
index 000000000000..1023cf0ddccc
--- /dev/null
+++ b/docs/contribute/community.rst
@@ -0,0 +1,51 @@
+TVM Community Structure
+=======================
+
+TVM adopts the Apache style model and is governed by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. There are several roles in the community:
+
+- Project Management Committee (PMC): a small group of active committers that moderate the discussion and RFCs, and manage project releases.
+- Committer: an individual who has made substantial contributions to the project, is granted write access to the project, and oversees the general direction of the project.
+- Code Owner: an individual who is responsible for a specific area of the codebase.
+- Reviewer: an individual who is qualified to review a specific area of the codebase.
+- Contributor: anyone who contributes to the project.
+
+This document explains the responsibilities and criteria for each role.
+See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors and their roles.
+
+
+Project Management Committee
+----------------------------
+
+The PMC consists of a small group of active committers that moderate the discussion, provide mentorship to committers and code owners and manage the project release. PMC members need to actively manage the general project directions. Note that most major design choices and proposed changes should reach consensus among the committers.
+
+Committer
+---------
+
+Committers are individuals who are granted write access to the project. Committers oversee the general project direction and participate in the evaluation of RFCs involving major design changes. Here is a list of useful things to do to help become a committer.
+
+- Deep understanding of one or a few modules in the project.
+- Good understanding of the general project structure, demonstrated by discussions over RFCs, code reviews and proposals of new features
+- Active history of code reviews that demonstrate good technical ability
+- Contribution history of high-quality documentation and tutorials that promote the project
+- History of creating clean, maintainable code with good test cases.
+
+New committers are nominated by current committers from current code owners.
+
+Code Owner
+----------
+
+A code owner is an individual who is responsible for a specific area of the code-base. Code owners are responsible for the areas they are in charge of and oversee the code review process of the corresponding module. Changes to a specific area need to be approved by one of its owners in order to be merged. Once a pull request is approved by the designated code owner, the code can be directly merged into the repo. Code owners are essential for a high quality and healthy codebase.
+
+We welcome new code owners that help to keep good code quality, testing, and documentation in specific areas. Here is a list of useful traits that help the community to recognize potential code owners:
+
+- High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review
+- Good coverage of tests and documentation in the contributions
+- Informative code reviews that help other contributors adhere to a good standard, spot problems in contributions, etc.
+- Active participation in the discussion forum
+
+Reviewer
+--------
+
+A reviewer is an individual who has actively contributed to the project and is willing to participate in the code review of new contributions. We invite reviewers from active contributors. The reviewer invitation will be sent to the potential reviewer's email, so please log in to the discussion forum so that we know which email address to send the invitation to.
+We actively seek reviews from reviewers. High-quality code reviews prevent technical debt in the long term and are crucial to the success of the project.
+A pull request to the project has to be reviewed by a reviewer in order to be merged.
diff --git a/docs/contribute/document.rst b/docs/contribute/document.rst
new file mode 100644
index 000000000000..ab67fbec9384
--- /dev/null
+++ b/docs/contribute/document.rst
@@ -0,0 +1,88 @@
+.. _doc_guide:
+
+Write Document and Tutorials
+============================
+
+We use `Sphinx <http://sphinx-doc.org>`_ for the main documentation.
+Sphinx supports both reStructuredText and markdown.
+When possible, we encourage the use of reStructuredText as it has richer features.
+Note that python docstrings and tutorials allow you to embed reStructuredText syntax.
+
+
+Document Python
+---------------
+We use the `numpydoc <https://numpydoc.readthedocs.io/en/latest/>`_
+format to document functions and classes.
+The following snippet gives an example docstring.
+We always document all the public functions;
+when necessary, we provide a usage example of the features we support (as shown below).
+
+.. code:: python
+
+ def myfunction(arg1, arg2, arg3=3):
+ """Briefly describe my function.
+
+ Parameters
+ ----------
+ arg1 : Type1
+ Description of arg1
+
+ arg2 : Type2
+ Description of arg2
+
+ arg3 : Type3, optional
+ Description of arg3
+
+ Returns
+ -------
+ rv1 : RType1
+ Description of return type one
+
+ Examples
+ --------
+ .. code:: python
+
+ # Example usage of myfunction
+ x = myfunction(1, 2)
+ """
+ return rv1
+
+Be careful to leave blank lines between sections of your documents.
+In the above case, there has to be a blank line before `Parameters`, `Returns` and `Examples`
+in order for the doc to be built correctly. To add a new function to the doc,
+we need to add the `sphinx.autodoc <http://www.sphinx-doc.org/en/master/ext/autodoc.html>`_
+rules to `docs/api/python <https://github.com/dmlc/tvm/tree/master/docs/api/python>`_.
+You can refer to the existing files under this folder on how to add the functions.
+
+
+Document C++
+------------
+We use the doxygen format to document C++ functions.
+The following snippet shows an example of a C++ docstring.
+
+.. code:: c++
+
+ /*!
+ * \brief Description of my function
+ * \param arg1 Description of arg1
+ * \param arg2 Description of arg2
+ * \returns describe return value
+ */
+ int myfunction(int arg1, int arg2) {
+ // When necessary, also add comments to clarify the internal logic
+ }
+
+Besides documenting function usage, we also highly recommend contributors
+add comments about the code logic to improve readability.
+
+
+Write Tutorials
+---------------
+We use `sphinx-gallery <https://sphinx-gallery.github.io/>`_ to build python tutorials.
+You can find the source code under `tutorials <https://github.com/dmlc/tvm/tree/master/tutorials>`_; it is quite self explanatory.
+One thing worth noting is that the comment blocks are written in reStructuredText instead of markdown, so be aware of the syntax.
+
+The tutorial code will run on our build server to generate the document page.
+So we may have a restriction like not being able to access a remote Raspberry Pi.
+In such a case, add a flag variable to the tutorial (e.g. `use_rasp`) that lets users easily switch to the real device by changing one flag,
+and use the existing environment to demonstrate the usage, as sketched below.
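+
+A hedged sketch of this pattern (the host name and port below are placeholders,
+and `use_rasp` is just a conventional flag name):
+
+.. code:: python
+
+    import tvm
+    from tvm.contrib import rpc
+
+    # Set use_rasp to True to run on a real Raspberry Pi via RPC;
+    # the docs build server keeps it False and runs locally instead.
+    use_rasp = False
+
+    if use_rasp:
+        remote = rpc.connect("raspberrypi.local", 9090)  # placeholder host/port
+        ctx = remote.cpu(0)
+    else:
+        ctx = tvm.cpu(0)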
diff --git a/docs/contribute/git_howto.md b/docs/contribute/git_howto.md
new file mode 100644
index 000000000000..53ff89b127df
--- /dev/null
+++ b/docs/contribute/git_howto.md
@@ -0,0 +1,57 @@
+# Git Usage Tips
+
+Here are some tips for git workflow.
+
+## How to resolve conflict with master
+- First rebase to most recent master
+```bash
+# The first two steps can be skipped after you do it once.
+git remote add upstream [url to tvm repo]
+git fetch upstream
+git rebase upstream/master
+```
+- Git may show some conflicts it cannot merge, say ```conflicted.py```.
+ - Manually modify the file to resolve the conflict.
+ - After you have resolved the conflict, mark it as resolved by
+```bash
+git add conflicted.py
+```
+- Then you can continue rebase by
+```bash
+git rebase --continue
+```
+- Finally, push to your fork; you may need to force push here.
+```bash
+git push --force
+```
+
+## How to combine multiple commits into one
+Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones,
+to create a PR with a set of meaningful commits. You can do it by the following steps.
+- Before doing so, configure the default editor of git if you haven't done so before.
+```bash
+git config core.editor the-editor-you-like
+```
+- Assume we want to merge the last 3 commits, type the following commands
+```bash
+git rebase -i HEAD~3
+```
+- It will pop up a text editor. Set the first commit to ```pick```, and change the later ones to ```squash```.
+- After you save the file, another text editor will pop up to ask you to modify the combined commit message.
+- Push the changes to your fork; you need to force push.
+```bash
+git push --force
+```
+
+## Reset to the most recent master
+You can always use git reset to reset your branch to the most recent master.
+Note that all your ***local changes will get lost***.
+So only do this when you do not have local changes or when your pull request has just been merged.
+```bash
+git reset --hard [hash tag of master]
+git push --force
+```
+
+## What is the consequence of force push
+The previous two tips require force push because we altered the path of the commits.
+It is fine to force push to your own fork, as long as the commits changed are only yours.
diff --git a/docs/contribute/index.rst b/docs/contribute/index.rst
new file mode 100644
index 000000000000..ea358e287f60
--- /dev/null
+++ b/docs/contribute/index.rst
@@ -0,0 +1,31 @@
+Contribute to TVM
+=================
+
+TVM has been developed by community members.
+Everyone is welcome to contribute.
+We value all forms of contributions, including, but not limited to:
+
+- Code reviewing of the existing patches.
+- Documentation and usage examples
+- Community participation in forums and issues.
+- Code readability and developer guide
+
+ - We welcome contributions that add code comments
+ to improve readability
+ - We also welcome contributions to docs that explain the
+ design choices of the internals.
+
+- Test cases to make the codebase more robust
+- Tutorials, blog posts, talks that promote the project.
+
+Here are guidelines for contributing to various aspects of the project:
+
+.. toctree::
+ :maxdepth: 2
+
+ community
+ code_review
+ document
+ code_guide
+ pull_request
+ git_howto
diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
new file mode 100644
index 000000000000..80a0448c08dd
--- /dev/null
+++ b/docs/contribute/pull_request.rst
@@ -0,0 +1,26 @@
+Submit a Pull Request
+=====================
+
+This is a quick guide to submitting a pull request; please also refer to the detailed guidelines.
+
+- Before submitting, please rebase your code on the most recent version of master; you can do it by
+
+ .. code:: bash
+
+ git remote add upstream [url to tvm repo]
+ git fetch upstream
+ git rebase upstream/master
+
+- Make sure the code style check passes by typing ``make lint``, and that all the existing test cases pass.
+- Add test cases to cover the new features or bug fixes the patch introduces.
+- Document the code you wrote, see more at :ref:`doc_guide`
+- Send the pull request, and fix the problems reported by automatic checks.
+  Request code reviews from other contributors and improve your patch according to the feedback.
+
+ - To get your code reviewed quickly, we encourage you to help review others' code so they can do the favor in return.
+ - Code review is a shepherding process that helps to improve contributor's code quality.
+ We should treat it proactively, to improve the code as much as possible before the review.
+ We highly value patches that can get in without extensive reviews.
+ - The detailed code review guidelines summarize useful lessons from past reviews.
+
+- The patch can be merged after the reviewers approve the pull request.
diff --git a/docs/deploy/android.md b/docs/deploy/android.md
new file mode 100644
index 000000000000..ca431693c63a
--- /dev/null
+++ b/docs/deploy/android.md
@@ -0,0 +1,25 @@
+# Deploy to Android
+
+
+## Build model for Android Target
+
+NNVM compilation of a model for the Android target can follow the same approach as android_rpc.
+
+A reference example can be found at [chainer-nnvm-example](https://github.com/tkat0/chainer-nnvm-example).
+
+The above example will directly run the compiled model on the RPC target. The modification below at [run_mobile.py](https://github.com/tkat0/chainer-nnvm-example/blob/5b97fd4d41aa4dde4b0aceb0be311054fb5de451/run_mobile.py#L64) will save the compilation output which is required on the Android target.
+
+```python
+lib.export_library("deploy_lib.so", ndk.create_shared)
+with open("deploy_graph.json", "w") as fo:
+ fo.write(graph.json())
+with open("deploy_param.params", "wb") as fo:
+ fo.write(nnvm.compiler.save_param_dict(params))
+```
+
+deploy_lib.so, deploy_graph.json, and deploy_param.params will go to the Android target.
+
+## TVM Runtime for Android Target
+
+Refer to [this guide](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/README.md#build-and-installation) to build the CPU/OpenCL flavor of the TVM runtime for the Android target.
+Refer to this [Java](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java) sample source for how to load the model and execute it from the Android Java TVM API.
diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
new file mode 100644
index 000000000000..bd0dae97879d
--- /dev/null
+++ b/docs/deploy/aocl_fpga.md
@@ -0,0 +1,92 @@
+AOCL Backend Example
+====================
+
+TVM supports Intel FPGA SDK for OpenCL also known as AOCL. Here is a tutorial for how to use TVM with AOCL.
+
+***Note***: This feature is still experimental. We cannot use AOCL to deploy an end-to-end neural network for now. In addition, we have only tested compilation for the emulation mode of AOCL.
+
+We use two python scripts for this tutorial.
+
+- build.py - a script to synthesize FPGA bitstream.
+```python
+import tvm
+
+tgt_host="llvm"
+tgt="aocl -device=s5_ref -mattr=emulator"
+
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+s = tvm.create_schedule(C.op)
+px, x = s[C].split(C.op.axis[0], nparts=1)
+
+s[C].bind(px, tvm.thread_axis("pipeline"))
+
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+fadd.save("myadd.o")
+fadd.imported_modules[0].save("myadd.aocx")
+
+tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"])
+```
+
+- run.py - a script to use FPGA as an accelerator.
+```python
+import tvm
+import numpy as np
+import os
+
+tgt="aocl -device=s5_ref -mattr=emulator"
+
+fadd = tvm.module.load("myadd.so")
+fadd_dev = tvm.module.load("myadd.aocx")
+fadd.import_module(fadd_dev)
+
+ctx = tvm.context(tgt, 0)
+
+n = 1024
+a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
+
+fadd(a, b, c)
+np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+```
+
+Setup
+-----
+
+- Install AOCL 17.1 on Ubuntu 16.04.4 LTS.
+- Install BSP for your FPGA device.
+- Install FPGA device driver.
+- Create an ICD file at /etc/OpenCL/vendors/Altera.icd so that the OpenCL platform can be found.
+```
+/opt/intelFPGA/17.1/hld/linux64/lib/libalteracl.so
+```
+- Create an FCD file for example at /opt/Intel/OpenCL/Boards/s5_ref.fcd so that your FPGA device can be found.
+```
+/opt/intelFPGA/17.1/hld/board/s5_ref/linux64/lib/libaltera_s5_ref_mmd.so
+```
+- Setup TVM with AOCL and OpenCL enabled.
+
+Emulation
+---------
+
+- Run software emulation
+```
+export CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1
+
+python build.py
+python run.py
+```
+
+- Run on FPGA devices (not tested)
+ - Change the tgt value to "aocl -device=s5_ref" in build.py and run.py
+```
+unset CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA
+
+python build.py
+python run.py
+```
diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
new file mode 100644
index 000000000000..7554ce7f64cd
--- /dev/null
+++ b/docs/deploy/aws_fpga.md
@@ -0,0 +1,152 @@
+HLS Backend Example
+===================
+
+TVM supports Xilinx FPGA board with SDAccel. Here is a tutorial for how to deploy TVM to AWS F1 FPGA instance.
+
+***Note***: This feature is still experimental. We cannot use SDAccel to deploy an end-to-end neural network for now.
+
+We use two python scripts for this tutorial.
+
+- build.py - a script to synthesize FPGA bitstream.
+```python
+import tvm
+
+tgt_host="llvm"
+tgt="sdaccel"
+
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+s = tvm.create_schedule(C.op)
+px, x = s[C].split(C.op.axis[0], nparts=1)
+
+s[C].bind(px, tvm.thread_axis("pipeline"))
+
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+fadd.save("myadd.o")
+fadd.imported_modules[0].save("myadd.xclbin")
+
+tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"])
+```
+
+- run.py - a script to use FPGA as an accelerator.
+```python
+import tvm
+import numpy as np
+import os
+
+tgt="sdaccel"
+
+fadd = tvm.module.load("myadd.so")
+if os.environ.get("XCL_EMULATION_MODE"):
+ fadd_dev = tvm.module.load("myadd.xclbin")
+else:
+ fadd_dev = tvm.module.load("myadd.awsxclbin")
+fadd.import_module(fadd_dev)
+
+ctx = tvm.context(tgt, 0)
+
+n = 1024
+a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
+
+fadd(a, b, c)
+np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+```
+
+Setup
+-----
+
+- Launch an instance using the FPGA Developer AMI. We don't need an F1 instance for emulation and synthesis, so it is recommended to use a lower cost instance for them.
+
+- Setup AWS FPGA development kit.
+```bash
+git clone https://github.com/aws/aws-fpga.git
+cd aws-fpga
+source sdaccel_setup.sh
+source ${XILINX_SDX}/settings64.sh
+```
+
+- Setup TVM with OpenCL enabled.
+
+Emulation
+---------
+
+- Create emconfig.json for emulation.
+```bash
+emconfigutil --platform ${AWS_PLATFORM} --nd 1
+```
+
+- Copy emconfig.json to the python binary directory. This is because the current Xilinx toolkit assumes that the host binary and the emconfig.json file are in the same path.
+```bash
+cp emconfig.json $(dirname $(which python))
+```
+
+- Run software emulation
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=sw_emu
+
+python build.py
+python run.py
+```
+
+- Run hardware emulation
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=hw_emu
+
+python build.py
+python run.py
+```
+
+
+Synthesis
+---------
+
+- Run synthesis with the following script. `XCL_EMULATION_MODE` must be set to 1 at this stage.
+
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=hw
+
+python build.py
+```
+
+- Create AWS FPGA image and upload it to AWS S3.
+```bash
+${SDACCEL_DIR}/tools/create_sdaccel_afi.sh -xclbin=myadd.xclbin -o=myadd \
+ -s3_bucket=<bucket-name> -s3_dcp_key=<dcp-folder-name> -s3_logs_key=<logs-folder-name>
+```
+This also generates an awsxclbin file, which is necessary to use the AWS FPGA image on F1 instances.
+
+Run
+---
+
+- Launch Amazon EC2 F1 instance.
+
+- Copy `myadd.so`, `myadd.awsxclbin`, and `run.py` to the F1 instance.
+
+- Setup AWS FPGA development kit.
+```bash
+git clone https://github.com/aws/aws-fpga.git
+cd aws-fpga
+source sdaccel_setup.sh
+```
+
+- Setup TVM with OpenCL enabled.
+
+- Become root and setup environment variables.
+```bash
+sudo sh
+source ${INSTALL_ROOT}/setup.sh
+```
+
+- Run
+```bash
+python run.py
+```
diff --git a/docs/how_to/deploy.md b/docs/deploy/cpp_deploy.md
similarity index 93%
rename from docs/how_to/deploy.md
rename to docs/deploy/cpp_deploy.md
index b9f219acc335..d02d33d18694 100644
--- a/docs/how_to/deploy.md
+++ b/docs/deploy/cpp_deploy.md
@@ -1,5 +1,6 @@
-How to Deploy TVM Modules
-=========================
+Deploy TVM Module using C++ API
+===============================
+
We provide an example on how to deploy TVM modules in [apps/howto_deploy](https://github.com/dmlc/tvm/tree/master/apps/howto_deploy)
To run the example, you can use the following command
@@ -12,8 +13,6 @@ cd apps/howto_deploy
Get TVM Runtime Library
-----------------------
-![](http://www.tvmlang.org/images/release/tvm_flexible.png)
-
The only thing we need is to link to a TVM runtime in your target platform.
TVM provides a minimum runtime, which costs around 300K to 600K depending on how many modules we use.
In most cases, we can use ```libtvm_runtime.so``` that comes with the build.
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
new file mode 100644
index 000000000000..0ef5cf5c8246
--- /dev/null
+++ b/docs/deploy/index.rst
@@ -0,0 +1,54 @@
+.. _deploy-and-integration:
+
+Deploy and Integration
+======================
+
+This page contains guidelines on how to deploy TVM to various platforms
+as well as how to integrate it with your project.
+
+.. image:: http://www.tvm.ai/images/release/tvm_flexible.png
+
+Unlike traditional deep learning frameworks, the TVM stack is divided into two major components:
+
+- TVM compiler, which does all the compilation and optimizations
+- TVM runtime, which runs on the target devices.
+
+In order to integrate the compiled module, we **do not** need to build the entire TVM stack on the target device. We only need to build the TVM compiler stack on a desktop and use it to cross-compile modules that are deployed on the target device.
+We only need to use a light-weight runtime API that can be integrated into various platforms.
+
+For example, you can run the following commands to build the runtime API
+on a Linux based embedded system such as Raspberry Pi:
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+ cd tvm
+ mkdir build
+ cp cmake/config.cmake build
+ cd build
+ cmake ..
+ make runtime
+
+Note that we type `make runtime` to only build the runtime library.
+If you want to include additional runtime such as OpenCL,
+you can modify `config.cmake` to enable these options.
+After you get the TVM runtime library, you can link the compiled library into your project.
+
+The easiest and recommended way to test, tune and benchmark TVM kernels on
+embedded devices is through TVM's RPC API.
+Here are the links to the related tutorials.
+
+- :ref:`tutorial-cross-compilation-and-rpc`
+- :ref:`tutorial-deploy-model-on-mali-gpu`
+- :ref:`tutorial-deploy-model-on-rasp`
+
+After you have finished tuning and benchmarking, you might need to deploy the model on the
+target device without relying on RPC. See the following resources on how to do so.
+
+.. toctree::
+ :maxdepth: 2
+
+ cpp_deploy
+ android
+ nnvm
+ integrate
diff --git a/docs/deploy/integrate.md b/docs/deploy/integrate.md
new file mode 100644
index 000000000000..b6f3b1faa3da
--- /dev/null
+++ b/docs/deploy/integrate.md
@@ -0,0 +1,50 @@
+Integrate TVM into Your Project
+===============================
+
+TVM's runtime is designed to be lightweight and portable.
+There are several ways you can integrate TVM into your project.
+
+This article introduces possible ways to integrate TVM
+as a JIT compiler to generate functions on your system.
+
+
+## DLPack Support
+
+TVM's generated function follows the PackedFunc convention.
+It is a function that can take positional arguments of
+standard types such as float, integer, and string.
+The PackedFunc takes a DLTensor pointer following the [dlpack](https://github.com/dmlc/dlpack) convention.
+The only thing you need to solve is how to create a corresponding DLTensor object.
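+
+As a hedged python-side illustration (`myadd.so` is a placeholder for any library
+built by `tvm.build`), `tvm.nd.array` already wraps a numpy array into a
+DLTensor-backed array that a PackedFunc accepts directly:
+
+```python
+import numpy as np
+import tvm
+
+fadd = tvm.module.load("myadd.so")  # placeholder module built by tvm.build
+a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"))
+b = tvm.nd.array(np.random.uniform(size=1024).astype("float32"))
+c = tvm.nd.array(np.zeros(1024, dtype="float32"))
+fadd(a, b, c)  # positional, type-erased call following the PackedFunc convention
+```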
+
+
+
+## Integrate User Defined C++ Array
+
+The only thing we have to do in C++ is to convert your array to a DLTensor and pass its address as
+```DLTensor*``` to the generated function.
+
+
+## Integrate User Defined Python Array
+
+Assume you have a python object ```MyArray```. There are three things you need to do:
+
+- Add a ```_tvm_tcode``` field to your array which returns ```tvm.TypeCode.ARRAY_HANDLE```
+- Support the ```_tvm_handle``` property in your object, which returns the address of the DLTensor as a python integer
+- Register this class by ```tvm.register_extension```
+
+```python
+# Example code
+import tvm
+
+class MyArray(object):
+ _tvm_tcode = tvm.TypeCode.ARRAY_HANDLE
+
+ @property
+ def _tvm_handle(self):
+ dltensor_addr = self.get_dltensor_addr()
+ return dltensor_addr
+
+# You can put the registration step in a separate file mypkg.tvm.py
+# and only import it if you want tvm to be an optional dependency.
+tvm.register_extension(MyArray)
+```
diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
new file mode 100644
index 000000000000..aa6c39facd1f
--- /dev/null
+++ b/docs/deploy/nnvm.md
@@ -0,0 +1,118 @@
+# Deploy NNVM Modules
+NNVM compiled modules are fully embedded in the TVM runtime as long as the ```GRAPH_RUNTIME``` option
+is enabled in the tvm runtime.
+
+
+In a nutshell, we will need three items to deploy a compiled module.
+Check out our tutorials on getting started with the NNVM compiler for more details.
+
+- The graph json data which contains the execution graph.
+- The tvm module library of compiled functions.
+- The parameter blobs for stored parameters.
+
+We can then use TVM's runtime API to deploy the compiled module.
+Here is an example in python.
+
+```python
+import tvm
+
+# tvm module for compiled functions.
+loaded_lib = tvm.module.load("deploy.so")
+# json graph
+loaded_json = open(temp.relpath("deploy.json")).read()
+# parameters in binary
+loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
+
+fcreate = tvm.get_global_func("tvm.graph_runtime.create")
+ctx = tvm.gpu(0)
+gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
+set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
+set_input("x", tvm.nd.array(x_np))
+gmodule["load_params"](loaded_params)
+run()
+out = tvm.nd.empty(shape)
+get_output(0, out)
+print(out.asnumpy())
+```
+
+An example in C++.
+```cpp
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <iostream>
+#include <fstream>
+#include <iterator>
+#include <algorithm>
+
+int main()
+{
+ // tvm module for compiled functions
+ tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile("deploy.so");
+
+ // json graph
+ std::ifstream json_in("deploy.json", std::ios::in);
+ std::string json_data((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
+ json_in.close();
+
+ // parameters in binary
+ std::ifstream params_in("deploy.params", std::ios::binary);
+ std::string params_data((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
+ params_in.close();
+
+ // parameters need to be TVMByteArray type to indicate the binary data
+ TVMByteArray params_arr;
+ params_arr.data = params_data.c_str();
+ params_arr.size = params_data.length();
+
+ int dtype_code = kDLFloat;
+ int dtype_bits = 32;
+ int dtype_lanes = 1;
+ int device_type = kDLCPU;
+ int device_id = 0;
+
+ // get global function module for graph runtime
+ tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(json_data, mod_syslib, device_type, device_id);
+
+ DLTensor* x;
+ int in_ndim = 4;
+ int64_t in_shape[4] = {1, 3, 224, 224};
+ TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
+ // load image data saved in binary
+ std::ifstream data_fin("cat.bin", std::ios::binary);
+ data_fin.read(static_cast<char*>(x->data), 3 * 224 * 224 * 4);
+
+ // get the function from the module(set input data)
+ tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
+ set_input("data", x);
+
+ // get the function from the module (load parameters)
+ tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
+ load_params(params_arr);
+
+ // get the function from the module(run it)
+ tvm::runtime::PackedFunc run = mod.GetFunction("run");
+ run();
+
+ DLTensor* y;
+ int out_ndim = 1;
+ int64_t out_shape[1] = {1000, };
+ TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
+
+ // get the function from the module(get output data)
+ tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
+ get_output(0, y);
+
+ // get the maximum position in output vector
+ auto y_iter = static_cast<float*>(y->data);
+ auto max_iter = std::max_element(y_iter, y_iter + 1000);
+ auto max_index = std::distance(y_iter, max_iter);
+ std::cout << "The maximum position in output vector is: " << max_index << std::endl;
+
+ TVMArrayFree(x);
+ TVMArrayFree(y);
+
+ return 0;
+}
+```
diff --git a/docs/dev/hybrid_script.rst b/docs/dev/hybrid_script.rst
new file mode 100644
index 000000000000..0af02a56e72c
--- /dev/null
+++ b/docs/dev/hybrid_script.rst
@@ -0,0 +1,76 @@
+Hybrid Frontend Developer Guide
+===============================
+
+If you are a developer:
+
+1. who is trying to write some preliminary patterns that are not supported by TVM yet,
+maybe :ref:`hybrid-langref-label` is a better place for you.
+
+2. who wants to know the implementation details of this module, you are right here!
+
+Features
+--------
+
+Software emulation
+~~~~~~~~~~~~~~~~~~
+
+In software emulation, the most interesting thing is the decorator ``tvm.hybrid.script``.
+This decorator does 2 things:
+
+1. Importing runtime variables
+
+2. Overloading the function according to the arguments passed
+
+Correct me if I am wrong: I believe the way 1. is implemented is dangerous, but I have no
+choice. What I do is add those names into the python dict ``func.__globals__``, and after
+the call to ``func`` is done, those names are cleaned up.
+
+Overloading is simple: the decorator checks the argument types and determines which function
+should actually be called, as sketched below.
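+
+A minimal sketch of the idea, not the actual implementation (``_to_ir`` below is a
+hypothetical stand-in for the parsing entry point):
+
+.. code:: python
+
+    import numpy
+
+    def _to_ir(func, args):
+        # hypothetical placeholder for the AST-parsing + IR-generation path
+        raise NotImplementedError("sketch only")
+
+    def script(func):
+        def wrapper(*args):
+            # Concrete numpy inputs: run the python body directly
+            # (software emulation).
+            if all(isinstance(a, (int, float, numpy.ndarray)) for a in args):
+                return func(*args)
+            # Symbolic tensor inputs: parse the function and emit HalideIR.
+            return _to_ir(func, args)
+        return wrapper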
+
+
+Backend Compilation
+~~~~~~~~~~~~~~~~~~~
+
+Compilation is a large module; you can see ``python/tvm/hybrid/var_decl.py`` and
+``python/tvm/hybrid/parser.py`` for more details. The first stage determines the
+usage, or more accurately the declaration, of each variable, and the second stage does
+the actual IR generation.
+
+Attributes
+~~~~~~~~~~
+
+So far, ONLY the tensors' `shape` attribute is supported. You can see ``visit_Subscript``
+in ``python/tvm/hybrid/parser.py`` for more details. This is a hacky solution; I just
+check the attributes when visiting a subscript.
+
+Loops
+~~~~~
+
+In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``.
+
+
+.. note::
+
+ Unlike in HalideIR, in ``loop_type(a, b)``, ``a`` is the starting point and ``b``
+ is the trip count of iterations. Here ``loop_type(a, b)`` indicates ``[a, b)``. Thus, when lowering it
+ to HalideIR, we need to do ``start, extent = a, b - a``.
+
+
+.. note::
+
+ In HalideIR those are enums and they are in passive form.
+ Here we use the active form to annotate loops, because they are ready to run.
+
+
+Variables
+~~~~~~~~~
+
+Because there are no variables in ``HalideIR``, all the mutable variables are lowered to an array of size 1.
+The first store of a variable is taken as its declaration, as illustrated below.
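+
+For example (illustrative only), a body such as
+
+.. code:: python
+
+    s = 0.0              # first store: treated as the declaration of s
+    for i in range(10):
+        s = s + a[i]     # later stores mutate the same variable
+
+is conceptually lowered so that ``s`` becomes a one-element buffer: the
+declaration becomes an allocation of size 1, and every read or write of ``s``
+becomes ``s[0]``.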
+
+Math intrinsics
+~~~~~~~~~~~~~~~
+So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, and ``popcount``, are supported.
+Math intrinsics are imported by the decorator. Most of the intrinsics are borrowed from the library implementation,
+except ``popcount`` and ``sigmoid``, which I implemented manually.
diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index 0d0ee852f6f8..f3ab322bfe53 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -1,11 +1,13 @@
-TVM Design and Developer Guide
-==============================
+Design and Developer Guide
+==========================
-Building an IR stack for deep learning systems involves many
-many systems-level design decisions.
+Building a compiler stack for deep learning systems involves many systems-level design decisions.
In this part of documentation, we share the rationale for the specific choices made when designing TVM.
.. toctree::
:maxdepth: 2
runtime
+ nnvm_json_spec
+ nnvm_overview
+ hybrid_script
diff --git a/docs/dev/nnvm_json_spec.rst b/docs/dev/nnvm_json_spec.rst
new file mode 100644
index 000000000000..31f2d2dc5c07
--- /dev/null
+++ b/docs/dev/nnvm_json_spec.rst
@@ -0,0 +1,212 @@
+NNVM Graph JSON Specification
+=============================
+
+NNVM uses JSON for graph serialization. This allows an NNVM graph to be
+exported to any backend, either natively supported or third-party,
+without any dependency such as protobuf.
+
+Getting started
+---------------
+
+A serialized NNVM graph in JSON format can be deserialized by any JSON
+parser.
+
+.. code:: python
+
+ # python
+ import json
+ with open('model.json', 'r') as f:
+ graph = json.loads(f.read())
+ print(graph.keys())
+
+``['nodes', 'arg_nodes', 'heads', 'node_row_ptr']``
+
+The following keys are valid in a JSON graph.
+
++--------------------------------------+------------+-----------------------------------+
+| Keys | Required | Description |
++======================================+============+===================================+
+| `nodes <#nodes>`__ | Yes | The nodes in graph. |
++--------------------------------------+------------+-----------------------------------+
+| `arg\_nodes <#arg_nodes>`__ | Yes | Indices of input nodes. |
++--------------------------------------+------------+-----------------------------------+
+| `heads <#heads>`__ | Yes | Indices of output nodes. |
++--------------------------------------+------------+-----------------------------------+
+| `node\_row\_ptr <#node_row_ptr>`__ | Optional | Depth first search row indices. |
++--------------------------------------+------------+-----------------------------------+
+| `attr <#attr>`__ | Optional | Additional information. |
++--------------------------------------+------------+-----------------------------------+
+
+nodes
+-----
+
+As the name suggests, ``nodes`` are either placeholders or
+computational nodes in the NNVM graph. The ``nodes`` are stored in a list.
+
+.. code:: python
+
+ nodes = graph['nodes']
+ print(len(nodes))
+ print(nodes[0])
+ print(nodes[3])
+
+::
+
+ 53
+ {'inputs': [], 'name': 'data', 'op': 'null'}
+ {'inputs': [[0, 0, 0], [1, 0, 0], [2, 0, 0]], 'attrs': {'channels': '64',
+ 'padding': '(1, 1)', 'layout': 'NCHW', 'kernel_size': '[3, 3]', 'groups': '1',
+ 'strides': '(1, 1)', 'use_bias': 'True', 'dilation': '(1, 1)'},
+ 'name': 'conv1_1', 'op': 'conv2d'}
+
+The following keys are valid in each node:
+
++----------------+------------------+----------+
+| Keys | Required | Descript |
+| | | ion |
++================+==================+==========+
+| op | Yes | The |
+| | | operator |
+| | | type |
+| | | name, |
+| | | 'null' |
+| | | is used |
+| | | if it's |
+| | | a |
+| | | placehol |
+| | | der/vari |
+| | | able/inp |
+| | | ut. |
++----------------+------------------+----------+
+| name | Yes | The |
+| | | given |
+| | | name of |
+| | | the |
+| | | node, |
+| | | defined |
+| | | by user |
+| | | composin |
+| | | g |
+| | | the |
+| | | network. |
++----------------+------------------+----------+
+| inputs | Yes | List of |
+| | | Entry |
+| | | of the |
+| | | input |
+| | | nodes, |
+| | | can be |
+| | | empty |
+| | | list []. |
+| | | Entry is |
+| | | a list |
+| | | of |
+| | | [nose\_i |
+| | | d, |
+| | | index, |
+| | | version] |
++----------------+------------------+----------+
+| attrs | Optional | Extra |
+| | | attribut |
+| | | es |
+| | | for the |
+| | | specific |
+| | | operator |
+| | | . |
++----------------+------------------+----------+
+| control\_deps | Optional | Control |
+| | | dependen |
+| | | cies, |
+| | | left |
+| | | blank |
+| | | unless |
+| | | specific |
+| | | ally |
+| | | used. |
++----------------+------------------+----------+
+
+``attrs`` for operators is a dictionary. Key-value pair examples:
+
++----------------+------------------+----------+----------+
+| Keys | Value | Operator | Descript |
+| | | | ion |
++================+==================+==========+==========+
+| 'channels' | '64' | conv2d | Output |
+| | | | channels |
+| | | | for 2d |
+| | | | convolut |
+| | | | ion. |
++----------------+------------------+----------+----------+
+| 'kernel\_size' | '[3, 3]' | conv2d | Convolut |
+| | | | ion |
+| | | | filter |
+| | | | kernel |
+| | | | size in |
+| | | | (h, w), |
+| | | | list and |
+| | | | tuple |
+| | | | both |
+| | | | works. |
++----------------+------------------+----------+----------+
+| 'use\_bias' | '1' | conv2d | Whether |
+| | | | use bias |
+| | | | such |
+| | | | that |
+| | | | `y = w |
+| | | | * x + b` |
+| | | | . |
++----------------+------------------+----------+----------+
+
+.. note::
+
+ Tips for parsing key-value pairs:
+
+ * Both key and value are stored as strings.
+
+ * Boolean values need extra attention; converting to int is recommended since `bool('0') == True` in python.
+
+ * For a full list of operator attributes, please refer to the core operator documentation.
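+
+For example, a small helper along these lines (a sketch, not part of NNVM)
+makes the string-typed ``attrs`` usable:
+
+.. code:: python
+
+    import ast
+
+    def parse_attr(value):
+        """Parse a string-typed attribute into a python value."""
+        try:
+            # '64' -> 64, '(1, 1)' -> (1, 1), 'True' -> True
+            return ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            # layout strings such as 'NCHW' stay as plain strings
+            return value
+
+    conv_attrs = {k: parse_attr(v) for k, v in nodes[3]['attrs'].items()}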
+
+arg\_nodes
+----------
+
+``arg_nodes`` is a list of indices of nodes which are
+placeholders/variables/inputs to the graph.
+
+.. code:: python
+
+ print(graph['arg_nodes'])
+
+::
+
+ [0, 1, 2, 6, 7, 11, 12, 15, 16, 20, 21, 24, 25, 29, 30, 33, 34, 39, 40, 44, 45, 49, 50]
+
+For example, ``nodes[3]`` is not in ``arg_nodes`` because it's an
+internal node.
+
+heads
+-----
+
+``heads`` is a list of entries as the outlet/output of the graph.
+
+.. code:: python
+
+ print(graph['heads'])
+
+::
+
+ [[52, 0, 0]]
+
+This example indicates that there is only one output in the graph, coming from
+the node with index 52.
+
+node\_row\_ptr
+--------------
+
+``node_row_ptr`` stores the history of the forward path, so you can skip
+constructing the entire graph in inference tasks.
+
+attrs
+-----
+
+``attrs`` can contain version numbers or similar helpful information.
diff --git a/docs/dev/nnvm_overview.md b/docs/dev/nnvm_overview.md
new file mode 100644
index 000000000000..4f01fdda2a03
--- /dev/null
+++ b/docs/dev/nnvm_overview.md
@@ -0,0 +1,126 @@
+
+# NNVM Design Overview
+
+NNVM is a reusable graph IR stack for deep learning systems. It provides useful APIs to construct, represent and transform computation graphs, to obtain most of the high-level optimizations needed in deep learning.
+As a part of the TVM stack for deep learning, NNVM also provides a shared compiler for deep learning frameworks to optimize, compile and deploy onto different hardware backends via [TVM](https://github.com/dmlc/tvm).
+
+## Key Requirements and Design Choices
+
+- Have minimum dependency in the deployment module.
+- Being able to add new operators to the IR, in a decentralized fashion.
+- Being able to add new optimization passes to the IR and apply them to existing graphs.
+
+Items 2 and 3 are particularly interesting if we compare them to a typical compiler IR. A compiler IR usually contains a fixed set of primitives (instructions) and uses them as a contract between optimization pass designers. This design enables easy addition of new optimization passes, but not of new operators (instructions), because every time we add a new instruction, we need to modify the passes to accommodate the change.
+
+Deep learning frameworks usually have a fixed operator interface (schema). These interfaces can contain properties like the shape inference function, or whether in-place computation can happen. The operator interface is again a contract that makes it easy to add new operators. But it is hard to add new passes in a decentralized fashion: a new optimization pass usually requires additional information, and this results in frequent changes of the centralized operator interface when we are exploring new optimizations. There is also a drawback for modularization. For example, a graph compiler for FPGA devices may not need the GPU device specific attributes.
+
+During our explorations in graph optimization and compilation, we find that it is important to quickly add both operators and passes to the framework without changing the core library.
+
+Here is a list of key elements in NNVM's design
+
+- An operator registry system to register and add new operators
+- An operator attribute system to provide properties of operators in a decentralized fashion
+- A reusable IR data structure for optimization passes.
+
+The above list is the generic language part of NNVM. Besides that, we also provide a collection of core operator primitives and graph optimization passes. The core tensor operator primitives and optimizations already cover common deep learning workloads. This design allows the NNVM compiler to be directly used as an optimization and compilation stack for frameworks. The extensible nature of NNVM makes new adjustments easy without constraining the backend providers.
+
+## Minimum Registration for a Symbolic Front-End
+To use NNVM to build a language front-end, a developer only needs to register minimum information about each operator.
+
+```c++
+NNVM_REGISTER_OP(add)
+.describe("add two data together")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(conv2d)
+.describe("take 2d convolution of input")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(assign)
+.describe("assign second input argument to the first one")
+.set_num_inputs(2);
+```
+
+After compiling the code with the NNVM library, users can compose the computation graph in python with the following interface.
+
+```python
+import nnvm.symbol as nn
+
+# symbolic variable
+x = nn.Variable('x')
+y = nn.Variable('y')
+w = nn.Variable('w')
+
+z = nn.conv2d(nn.elemwise_add(x, y), w, kernel_size=(2,2), name='conv1')
+```
+
+The graph structure is interchangeable between the frontend and the backend. The Python interface is supported currently. More language support can easily be added in the future.
+
+## Operator Attribute for More Extensions
+
+The minimum information provided by the operator is enough to get a front-end. However, we need more knowledge about each operator to do transformations and execute the graph.
+A typical difference between a neural network's computation graph and a traditional compiler IR is that there are a lot more high-level operators. We cannot fix the set of operators in the IR.
+
+NNVM allows developers to register attributes of each operator. The attributes can include the shape inference function, whether the operator can perform in-place calculation, etc.
+
+This design of having an operator attribute registry is not uncommon in deep learning systems.
+For example, MXNet has an ```OpProperty``` class, TensorFlow has ```OpDef``` and Caffe2 has an ```OperatorSchema``` class.
+However, the operator attribute interfaces listed in these frameworks only support a fixed number of defined attributes of interest to the system. If we want to extend the framework to add a new attribute to each operator, we need to change the operator registry.
+Eventually, the operator interface grows to be very big and has to evolve in the centralized repo.
+
+In NNVM, we decided to change the design and support arbitrary types of operator attributes, without changing the registry interface. The minimum interface also makes it easier to share across multiple projects.
+
+Users can register a new attribute, such as an in-place property checking function, as follows.
+```c++
+using FInplaceOption = std::function<
+ std::vector<std::pair<int, int> > (const NodeAttrs& attrs)>;
+
+// we can register attributes from multiple places.
+NNVM_REGISTER_OP(elemwise_add)
+.set_num_inputs(2);
+
+// register to tell that the first input can be computed in place with the first output
+NNVM_REGISTER_OP(add)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+ return std::vector<std::pair<int, int> >{{0, 0}};
+ });
+
+NNVM_REGISTER_OP(exp)
+.set_num_inputs(1)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+ return std::vector<std::pair<int, int> >{{0, 0}};
+ });
+```
+
+We can query these attributes from arbitrary parts of the code, as in the following example. Under the hood, each attribute is stored in a columnar store from which it can easily be retrieved for quick lookups.
+
+```c++
+void MyFunction() {
+ const Op* add = Op::Get("add");
+ // if we need quick query, we can use static variable
+ // attribute map contains attributes of all operators.
+ static auto& finplace_option_map = Op::GetAttr<FInplaceOption>("FInplaceOption");
+
+ // quick look up attribute of add, O(1) time, vector index lookup internally.
+ auto add_inplace = finplace_option_map[add];
+}
+```
+Besides keeping the core minimal, this attribute store enables decentralization of projects.
+Before, all the attributes of an operator had to sit on a centralized interface class.
+Now, everyone can register their own attributes and take the attributes they need from other projects, without changing the operator interface and core library.
+
+
+## Graph and Pass
+
+We can use the additional information in the attribute registry to do optimizations and get more information about the graph. The Graph is the unit we manipulate in these steps. A Graph in NNVM contains
+two parts:
+- The computation graph structure
+- An attribute map from string to any type: ```map<std::string, std::shared_ptr<any> >```
+
+The attribute map is quite important, as we may need different kinds
+of information about the graph during the transformation process, be it
+the shapes of each tensor, the types of each tensor or the storage allocation plans.
+
+A ```Pass``` can take a graph with existing attribute information
+and transform it into the same graph structure with more graph attributes, or into another graph, as in the sketch below.
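+
+As a hedged python-side illustration (using the `nnvm.graph` API from this repo;
+which attributes are present depends on which passes have run):
+
+```python
+import nnvm.symbol as sym
+import nnvm.graph as graph_mod
+
+x = sym.Variable('x')
+z = sym.exp(sym.elemwise_add(x, x))
+g = graph_mod.create(z)
+
+# Apply a pass by name; the result is a new graph that carries an extra
+# attribute column (here "shape", filled in by InferShape).
+g2 = g.apply('InferShape')
+print(g2.json_attr('shape'))
+```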
diff --git a/docs/dev/runtime.md b/docs/dev/runtime.md
index b9cc81186200..a5d8138c3372 100644
--- a/docs/dev/runtime.md
+++ b/docs/dev/runtime.md
@@ -1,18 +1,18 @@
# TVM Runtime System
-TVM support multiple programming languages for compiler stack development and deployment.
-In this note, we explain the key element of TVM runtime.
+TVM supports multiple programming languages for the compiler stack development and deployment.
+In this note, we explain the key elements of the TVM runtime.
-![](http://www.tvmlang.org/images/release/tvm_flexible.png)
+![](http://www.tvm.ai/images/release/tvm_flexible.png)
We need to satisfy quite a few interesting requirements
- Deployment: invoke the compiled function from python/javascript/c++ language.
- Debug: define a function in python and call that from a compiled function.
-- Link: write driver code to call device specific code(CUDA) and call it from compiled host function.
+- Link: write driver code to call device specific code (CUDA) and call it from compiled host function.
- Prototype: define an IR pass from python and call that from C++ backend.
-- Expose: compiler stack developed in c++ to front-end (i.e, python)
-- Experiment: ship a compiled function to an embedded device directly run there.
+- Expose: compiler stack developed in c++ to front-end (i.e, python)
+- Experiment: ship a compiled function to an embedded device to directly run there.
We want to be able to define a function from any language and call from another.
We also want the runtime core to be minimal to deploy to embedded devices.
@@ -41,11 +41,11 @@ void CallPacked() {
```
In the above codeblock, we defined a PackedFunc MyAdd. It takes two arguments
: ```args``` represents input arguments and ```rv``` represents return value.
-The function is type-erased, which means the function signature does not restrict which input type to pass in or type to return.
+The function is type-erased, which means that the function signature does not restrict which input type to pass in or type to return.
Under the hood, when we call a PackedFunc, it packs the input arguments to TVMArgs on stack,
-and get the result back via TVMRetValue.
+and gets the result back via TVMRetValue.
-Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created.
+Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created.
The following example registers PackedFunc in C++ and calls from python.
```c++
@@ -74,7 +74,7 @@ The restriction makes the implementation simple without the need of serializatio
Despite being minimum, the PackedFunc is sufficient for the use-case of deep learning deployment as
most functions only take DLTensor or numbers.
-Since one PackedFunc can take another PackedFunc as argument,
+Since one PackedFunc can take another PackedFunc as an argument,
we can pass functions from python(as PackedFunc) to C++.
```c++
TVM_REGISTER_GLOBAL("callhello")
@@ -97,15 +97,15 @@ callhello(f)
```
TVM provides a [minimum C API](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h),
-that allows us to embedded the PackedFunc into any languages. Besides python, so far we supported
+which allows us to embed the PackedFunc into any languages. Besides python, so far we supported
[java](https://github.com/dmlc/tvm/tree/master/jvm) and [javascript](https://github.com/dmlc/tvm/tree/master/web).
-This philosophy of embedded API is very like Lua, except that we don't have a new language and uses C++.
+This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++.
One fun fact about PackedFunc is that we use it for both compiler and deployment stack.
- All TVM's compiler pass functions are exposed to frontend as PackedFunc, see [here](https://github.com/dmlc/tvm/tree/master/src/api)
-- The compiled modules also returns compiled function as PackedFunc
+- The compiled module also returns the compiled function as PackedFunc
-To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules(e.g., CUDA) get included.
+To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included.
The overhead of calling into PackedFunc vs. a normal function is small, as it is only saving a few values on the stack.
So it is OK as long as we don't wrap small functions.
@@ -113,9 +113,9 @@ In summary, the PackedFunc is the universal glue in TVM where we use it extensiv
## Module
-Since TVM support multiple types of devices, we need to support different type of drivers.
-We have to use driver API to load the kernel, set up the argument in packed format and perform kernel launch.
-We also need to patch up the driver API so that the exposed functions is threadsafe.
+Since TVM supports multiple types of devices, we need to support different type of drivers.
+We have to use the driver API to load the kernel, set up the argument in packed format and perform kernel launch.
+We also need to patch up the driver API so that the exposed functions are threadsafe.
So we often need to implement these driver glues in C++ and expose them to the user.
We can certainly not do it for each type of functions, so again PackedFunc is our answer.
@@ -130,32 +130,32 @@ of new device easy, and we do not need to redo the host code generation for each
## Remote Deployment
The PackedFunc and Module system also makes it easy to ship the function into remote devices directly.
-Under the hood, we have a RPCModule that serializes the arguments and do the data movement and launches the computation on the remote.
+Under the hood, we have an RPCModule that serializes the arguments to do the data movement and launches the computation on the remote.
-![](http://www.tvmlang.org/images/release/tvm_rpc.png)
+![](http://www.tvm.ai/images/release/tvm_rpc.png)
The RPC server itself is minimum and can be bundled into the runtime. We can start a minimum TVM
-RPC server on iPhone/android/raspberry pi or even your browser. The cross compilation on server and shipping of the module for testing can be done in the same script. Checkout
-[Cross compilation and RPC tutorial](http://docs.tvmlang.org/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details.
+RPC server on iPhone/android/raspberry pi or even the browser. The cross compilation on the server and the shipping of the module for testing can be done in the same script. Check out
+[Cross compilation and RPC tutorial](http://docs.tvm.ai/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details.
-This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone copy the result back and do verification on my host via numpy. We can also do the profiling using the same script.
+This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test cases in swift/objective-c from scratch -- we can use RPC to execute on the iPhone, copy the result back and verify on the host via numpy. We can also do the profiling using the same script.
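+
+As a rough sketch of that flow (the address, port, and file name below are placeholders, and a TVM RPC server is assumed to be running on the device):
+
+```python
+import tvm
+from tvm.contrib import rpc
+
+remote = rpc.connect("192.168.0.10", 9090)  # connect to the device's RPC server
+remote.upload("mylib.so")                   # ship the cross-compiled module
+f = remote.load_module("mylib.so")          # load it on the remote device
+# The functions of f are PackedFunc: calling them serializes the arguments,
+# runs the kernel remotely, and copies the results back to the host.
+```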
## TVM Node and Compiler Stack
As we mentioned earlier, we build the compiler stack API on top of the PackedFunc runtime system.
-We faced a constant changing the compiler API for the need of research. We need a new language object or IR node from now and then when we want to test out new primitives.
+We face constant changes of the compiler API for the need of research: we need a new language object or IR node whenever we want to test out new primitives.
However, we don't want to keep changing our API. Besides that, we also want to
- be able to serialize any language object and IRs
- be able to explore, print, and manipulate the IR objects in front-end language to do quick prototyping.
We introduced a base class, called [Node](https://github.com/dmlc/HalideIR/blob/master/src/tvm/node.h#L52) to solve this problem.
-All the language object in compiler stack is a subclass of Node. Each node contains a string type_key that uniquely identifies
-the type of object. We choose string instead of int as type key so new Node class can be added in decentralized fashion without
+All the language objects in the compiler stack are subclasses of Node. Each node contains a string type_key that uniquely identifies
+the type of object. We choose string instead of int as the type key so new Node classes can be added in a decentralized fashion without
adding the code back to the central repo. To speed up dispatching, we allocate an integer type_index at runtime for each type_key.
-Since usually one Node object could be referenced in multiple places in the language. We use a shared_ptr to keep
-track of reference. We use NodeRef class to represents a reference to the Node.
+Since usually one Node object could be referenced in multiple places in the language, we use a shared_ptr to keep
+track of references. We use the NodeRef class to represent a reference to the Node.
We can roughly view the NodeRef class as a shared_ptr to the Node container.
We can also define a subclass of NodeRef to hold each subtype of Node. Each Node class needs to define the VisitAttrs function.
@@ -206,7 +206,7 @@ class TensorNode : public Node {
```
In the above examples, both ```Operation``` and ```Array``` are NodeRef.
The VisitAttrs gives us a reflection API to visit each member of the object.
-We can use this function to visit the node any serialize any language object recursively.
+We can use this function to visit the node and serialize any language object recursively.
It also allows us to get members of an object easily in the front-end language.
For example, in the following code, we access the op field of the TensorNode.
@@ -220,13 +220,13 @@ print(x.op.name)
New Nodes can be added in C++ without changing the front-end runtime, making it easy to extend the compiler stack.
Note that this is not the fastest way to expose members to the front-end language, but it might be one of the simplest
-approach possible. We also find it fits our purposes as we mainly use python for testing and prototyping and still use c++
+approaches possible. We also find that it fits our purposes as we mainly use python for testing and prototyping and still use c++
to do the heavy lifting.
## Implementation Details
Each argument in PackedFunc contains a union value [TVMValue](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L122)
-and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language
+and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and the statically typed language to
do runtime type checking during conversion.
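+
+The C sketch below illustrates this convention, packing each argument as a TVMValue plus a type code (`myadd` is a hypothetical registered function and error checking is omitted):
+
+```c
+#include <tvm/runtime/c_runtime_api.h>
+
+int call_myadd(double *result) {
+  TVMFunctionHandle f;
+  TVMFuncGetGlobal("myadd", &f);  // look up a registered PackedFunc
+
+  TVMValue args[2];
+  int type_codes[2];
+  args[0].v_float64 = 1.0; type_codes[0] = kDLFloat;  // value + type code
+  args[1].v_float64 = 2.0; type_codes[1] = kDLFloat;
+
+  TVMValue ret;
+  int ret_type;
+  int status = TVMFuncCall(f, args, type_codes, 2, &ret, &ret_type);
+  *result = ret.v_float64;
+  return status;
+}
+```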
The relevant files are
diff --git a/docs/faq.md b/docs/faq.md
index 92cb886f1ca7..54df0ced8fa8 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -4,7 +4,7 @@ This document contains frequently asked questions.
How to Install
--------------
-See [Installation](https://github.com/dmlc/tvm/blob/master/docs/how_to/install.md)
+See [Installation](http://tvm.ai/install/)
TVM's relation to Other IR/DSL Projects
---------------------------------------
diff --git a/docs/how_to/contribute.md b/docs/how_to/contribute.md
deleted file mode 100644
index a0ba99bdf718..000000000000
--- a/docs/how_to/contribute.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Contribute to TVM
-
-TVM has been developed by community members.
-Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
-
-- Please add your name to [CONTRIBUTORS.md](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md)
-- Please update [NEWS.md](https://github.com/dmlc/tvm/blob/master/NEWS.md) to add note on your changes to the API or added a new document.
-
-## Guidelines
-* [Submit Pull Request](#submit-pull-request)
-* [Git Workflow Howtos](#git-workflow-howtos)
- - [How to resolve conflict with master](#how-to-resolve-conflict-with-master)
- - [How to combine multiple commits into one](#how-to-combine-multiple-commits-into-one)
- - [What is the consequence of force push](#what-is-the-consequence-of-force-push)
-* [Document](#document)
-* [Testcases](#testcases)
-* [Examples](#examples)
-* [Core Library](#core-library)
-* [Python Package](#python-package)
-
-## Submit Pull Request
-* Before submit, please rebase your code on the most recent version of master, you can do it by
-```bash
-git remote add upstream [url to tvm repo]
-git fetch upstream
-git rebase upstream/master
-```
-* If you have multiple small commits,
- it might be good to merge them together(use git rebase then squash) into more meaningful groups.
-* Send the pull request!
- - Fix the problems reported by automatic checks
- - If you are contributing a new module or new function, add a test.
-
-## Git Workflow Howtos
-### How to resolve conflict with master
-- First rebase to most recent master
-```bash
-# The first two steps can be skipped after you do it once.
-git remote add upstream [url to tvm repo]
-git fetch upstream
-git rebase upstream/master
-```
-- The git may show some conflicts it cannot merge, say ```conflicted.py```.
- - Manually modify the file to resolve the conflict.
- - After you resolved the conflict, mark it as resolved by
-```bash
-git add conflicted.py
-```
-- Then you can continue rebase by
-```bash
-git rebase --continue
-```
-- Finally push to your fork, you may need to force push here.
-```bash
-git push --force
-```
-
-### How to combine multiple commits into one
-Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones,
-to create a PR with set of meaningful commits. You can do it by following steps.
-- Before doing so, configure the default editor of git if you haven't done so before.
-```bash
-git config core.editor the-editor-you-like
-```
-- Assume we want to merge last 3 commits, type the following commands
-```bash
-git rebase -i HEAD~3
-```
-- It will pop up an text editor. Set the first commit as ```pick```, and change later ones to ```squash```.
-- After you saved the file, it will pop up another text editor to ask you modify the combined commit message.
-- Push the changes to your fork, you need to force push.
-```bash
-git push --force
-```
-
-### Reset to the most recent master
-You can always use git reset to reset your version to the most recent master.
-Note that all your ***local changes will get lost***.
-So only do it when you do not have local changes or when your pull request just get merged.
-```bash
-git reset --hard [hash tag of master]
-git push --force
-```
-
-### What is the consequence of force push
-The previous two tips requires force push, this is because we altered the path of the commits.
-It is fine to force push to your own fork, as long as the commits changed are only yours.
-
-## Testcases
-- All the testcases are in tests
-
-## Core Library
-- Follow Google C style for C++.
-- We use doxygen to document all the interface code.
-- You can reproduce the linter checks by typing ```make lint```
-
-## Python Package
-- Always add docstring to the new functions in numpydoc format.
-- You can reproduce the linter checks by typing ```make lint```
diff --git a/docs/how_to/install.md b/docs/how_to/install.md
deleted file mode 100644
index 54db42281623..000000000000
--- a/docs/how_to/install.md
+++ /dev/null
@@ -1,92 +0,0 @@
-Installation Guide
-==================
-This page gives instructions on how to build and install the tvm package from
-scratch on various systems. It consists of two steps:
-
-1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows).
-2. Setup for the language packages (e.g. Python Package).
-
-To get started, clone tvm repo from github. It is important to clone the submodules along, with ```--recursive``` option.
-```bash
-git clone --recursive https://github.com/dmlc/tvm
-```
-For windows users who use github tools, you can open the git shell, and type the following command.
-```bash
-git submodule init
-git submodule update
-```
-
-## Contents
-- [Build the Shared Library](#build-the-shared-library)
-- [Python Package Installation](#python-package-installation)
-
-## Build the Shared Library
-
-Our goal is to build the shared library:
-- On Linux/OSX the target library is `libtvm.so`
-- On Windows the target library is `libtvm.dll`
-
-The minimal building requirement is
-- A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
-
-You can edit `make/config.mk` to change the compile options, and then build by
-`make`. If everything goes well, we can go to the specific language installation section.
-
-### Building on Windows
-
-TVM support build via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**. In order to generate the VS solution file using cmake,
-make sure you have a recent version of cmake added to your path and then from the tvm directory:
-
-```bash
-mkdir build
-cd build
-cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
-```
-This will generate the VS project using the MSVC 14 64 bit generator. Open the .sln file in the build directory and build with Visual Studio.
-
-### Customized Building
-
-Install prerequisites first:
-
-```bash
-sudo apt-get update
-sudo apt-get install -y python python-dev python-setuptools gcc libtinfo-dev zlib1g-dev
-```
-
-The configuration of tvm can be modified by ```config.mk```
-- First copy ```make/config.mk``` to the project root, on which
- any local modification will be ignored by git, then modify the according flags.
-- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.
- - LLVM 4.0 or higher is needed for build with LLVM. Note that verison of LLVM from default apt may lower than 4.0.
- - Since LLVM takes long time to build from source, you can download pre-built version of LLVM frorm
- [LLVM Download Page](http://releases.llvm.org/download.html).
- - Unzip to a certain location, modify ```config.mk``` to add ```LLVM_CONFIG=/path/to/your/llvm/bin/llvm-config```
- - You can also use [LLVM Nightly Ubuntu Build](https://apt.llvm.org/)
- - Note that apt-package append ```llvm-config``` with version number. For example, set ```LLVM_CONFIG=llvm-config-4.0``` if you installed 4.0 package
- - By default CUDA and OpenCL code generator do not require llvm.
-
-## Python Package Installation
-
-The python package is located at python
-There are several ways to install the package:
-
-1. Set the environment variable `PYTHONPATH` to tell python where to find
- the library. For example, assume we cloned `tvm` on the home directory
- `~`. then we can added the following line in `~/.bashrc`.
- It is ***recommended for developers*** who may change the codes.
- The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call ```setup``` again)
-
- ```bash
- export PYTHONPATH=/path/to/tvm/python:/path/to/tvm/topi/python:${PYTHONPATH}
- ```
-
-2. Install tvm python bindings by `setup.py`:
-
- ```bash
- # install tvm package for the current user
- # NOTE: if you installed python via homebrew, --user is not needed during installaiton
- # it will be automatically installed to your user directory.
- # providing --user flag may trigger error during installation in such case.
- cd python; python setup.py install --user; cd ..
- cd topi/python; python setup.py install --user; cd ../..
- ```
diff --git a/docs/index.rst b/docs/index.rst
index 9fa690e00fd9..20e64bfef641 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,22 +1,39 @@
TVM Documentation
=================
-Welcome to TVM documentation.
-
-
-Contents
---------
-
+Get Started
+-----------
.. toctree::
:maxdepth: 1
- self
- how_to/install
+ install/index
tutorials/index
+ vta/index
+ deploy/index
+ contribute/index
faq
- how_to/deploy
- how_to/contribute
+
+API Reference
+-------------
+.. toctree::
+ :maxdepth: 2
+
+ langref/index
api/python/index
- dev/index
api_links
+
+Developer Guide
+---------------
+.. toctree::
+ :maxdepth: 2
+
+ dev/index
+ nnvm_top
+
+
+Index
+-----
+.. toctree::
+ :maxdepth: 1
+
genindex
diff --git a/docs/install/docker.rst b/docs/install/docker.rst
new file mode 100644
index 000000000000..8d089522761d
--- /dev/null
+++ b/docs/install/docker.rst
@@ -0,0 +1,45 @@
+.. _docker-images:
+
+Docker Images
+=============
+We provide several prebuilt docker images to quickly try out tvm.
+These images are also helpful to run through TVM demos and tutorials.
+You can get the docker images via the following steps.
+We need `docker <https://docs.docker.com/engine/installation/>`_ and
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use cuda.
+
+First, clone tvm repo to get the auxiliary scripts
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+
+
+We can then use the following command to launch a `tvmai/demo-cpu` image.
+
+.. code:: bash
+
+ /path/to/tvm/docker/bash.sh tvmai/demo-cpu
+
+You can also change `demo-cpu` to `demo-gpu` to get a CUDA enabled image.
+You can find all the prebuilt images in `<https://hub.docker.com/r/tvmai/>`_
+
+
+This auxiliary script does the following things:
+
+- Mount current directory to /workspace
+- Switch user to be the same user that calls the bash.sh (so you can read/write host system)
+- Use the host-side network (so you can use jupyter notebook)
+
+
+Then you can start a jupyter notebook by typing
+
+.. code:: bash
+
+ jupyter notebook
+
+
+Docker Source
+-------------
+Check out `<https://github.com/dmlc/tvm/tree/master/docker>`_ if you are interested in
+building your own docker images.
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
new file mode 100644
index 000000000000..edeba1ccfadc
--- /dev/null
+++ b/docs/install/from_source.rst
@@ -0,0 +1,157 @@
+.. _install-from-source:
+
+Install from Source
+===================
+This page gives instructions on how to build and install the tvm package from
+scratch on various systems. It consists of two steps:
+
+1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows).
+2. Setup for the language packages (e.g. Python Package).
+
+To get started, clone tvm repo from github. It is important to clone the submodules along, with ``--recursive`` option.
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+
+For windows users who use github tools, you can open the git shell, and type the following command.
+
+.. code:: bash
+
+ git submodule init
+ git submodule update
+
+
+Build the Shared Library
+------------------------
+
+Our goal is to build the shared libraries:
+
+- On Linux the target libraries are `libtvm.so, libtvm_topi.so`
+- On OSX the target libraries are `libtvm.dylib, libtvm_topi.dylib`
+- On Windows the target libraries are `libtvm.dll, libtvm_topi.dll`
+
+On Ubuntu, you can install the minimal prerequisites with:
+
+.. code:: bash
+
+ sudo apt-get update
+ sudo apt-get install -y python python-dev python-setuptools gcc libtinfo-dev zlib1g-dev
+
+The minimal building requirements are
+
+- A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
+- CMake 3.5 or higher
+- We highly recommend building with LLVM to enable all the features.
+- It is possible to build without the llvm dependency if we only want to use CUDA/OpenCL.
+
+We use cmake to build the library.
+The configuration of tvm can be modified by `config.cmake`.
+
+
+- First, check the cmake in your system. If you do not have cmake,
+  you can obtain the latest version from the `official website <https://cmake.org/download/>`_
+- Then create a build directory and copy ``cmake/config.cmake`` into it.
+
+ .. code:: bash
+
+ mkdir build
+ cp cmake/config.cmake build
+
+- Edit ``build/config.cmake`` to customize the compilation options
+
+ - On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors.
+  - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable the CUDA backend. Do the same for other backends and libraries
+    (OpenCL, ROCM, METAL, VULKAN, ...) you want to enable.
+
+- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.
+
+  - LLVM 4.0 or higher is needed for builds with LLVM. Note that the version of LLVM from the default apt may be lower than 4.0.
+  - Since LLVM takes a long time to build from source, you can download a pre-built version from the
+    `LLVM Download Page <http://releases.llvm.org/download.html>`_.
+
+
+ - Unzip to a certain location, modify ``build/config.cmake`` to add ``set(USE_LLVM /path/to/your/llvm/bin/llvm-config)``
+ - You can also directly set ``set(USE_LLVM ON)`` and let cmake search for a usable version of LLVM.
+
+  - You can also use the `LLVM Nightly Ubuntu Build <https://apt.llvm.org/>`_
+
+    - Note that the apt package appends the version number to ``llvm-config``.
+      For example, set ``set(USE_LLVM llvm-config-4.0)`` if you installed the 4.0 package.
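+
+- As an illustration only (the llvm-config value below is a placeholder for whichever version you installed), a minimal ``build/config.cmake`` enabling CUDA and LLVM could contain:
+
+  .. code:: cmake
+
+    set(USE_CUDA ON)
+    set(USE_LLVM llvm-config-4.0)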
+
+- We can then build tvm and related libraries.
+
+ .. code:: bash
+
+ cd build
+ cmake ..
+ make -j4
+
+If everything goes well, we can go to :ref:`python-package-installation`.
+
+Building on Windows
+~~~~~~~~~~~~~~~~~~~
+
+TVM supports building via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**.
+In order to generate the VS solution file using cmake,
+make sure you have a recent version of cmake added to your path and then from the tvm directory:
+
+.. code:: bash
+
+ mkdir build
+ cd build
+ cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
+
+This will generate the VS project using the MSVC 14 64 bit generator.
+Open the .sln file in the build directory and build with Visual Studio.
+In order to build with LLVM in windows, you will need to build LLVM from source.
+You also need to build nnvm by running the same script under the nnvm folder.
+
+Building ROCm support
+~~~~~~~~~~~~~~~~~~~~~
+
+Currently, ROCm is supported only on linux, so all the instructions are written with linux in mind.
+
+- Set ``set(USE_ROCM ON)``, set ROCM_PATH to the correct path.
+- You need to first install the HIP runtime from ROCm. Make sure the installation system has ROCm installed on it.
+- Install the latest stable version of LLVM (v6.0.1) and LLD; make sure ``ld.lld`` is available on the command line.
+
+.. _python-package-installation:
+
+Python Package Installation
+---------------------------
+
+The python package is located in the `python` directory.
+There are several ways to install the package:
+
+1. Set the environment variable `PYTHONPATH` to tell python where to find
+   the library. For example, assume we cloned `tvm` in the home directory
+   `~`; then we can add the following line to `~/.bashrc`.
+   This is **recommended for developers** who may change the code.
+   The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
+
+ .. code:: bash
+
+ export PYTHONPATH=/path/to/tvm/python:/path/to/tvm/topi/python:/path/to/tvm/nnvm/python:${PYTHONPATH}
+
+
+2. Install tvm python bindings by `setup.py`:
+
+ .. code:: bash
+
+ # install tvm package for the current user
+    # NOTE: if you installed python via homebrew, --user is not needed during installation
+ # it will be automatically installed to your user directory.
+ # providing --user flag may trigger error during installation in such case.
+ export MACOSX_DEPLOYMENT_TARGET=10.9 # This is required for mac to avoid symbol conflicts with libstdc++
+ cd python; python setup.py install --user; cd ..
+ cd topi/python; python setup.py install --user; cd ../..
+ cd nnvm/python; python setup.py install --user; cd ../..
+
+
+Install Contrib Libraries
+-------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ nnpack
diff --git a/docs/install/index.rst b/docs/install/index.rst
new file mode 100644
index 000000000000..cc39f2433c7e
--- /dev/null
+++ b/docs/install/index.rst
@@ -0,0 +1,13 @@
+Installation
+============
+To install TVM, please read :ref:`install-from-source`.
+If you are interested in deploying to mobile/embedded devices,
+you do not need to install the entire tvm stack on your device.
+Instead, you only need the runtime; please read :ref:`deploy-and-integration`.
+If you would like to quickly try out TVM or run demos and tutorials, check out :ref:`docker-images`.
+
+.. toctree::
+ :maxdepth: 2
+
+ from_source
+ docker
diff --git a/docs/install/nnpack.md b/docs/install/nnpack.md
new file mode 100644
index 000000000000..d4e6e39e4023
--- /dev/null
+++ b/docs/install/nnpack.md
@@ -0,0 +1,81 @@
+# NNPACK Contrib Installation
+
+[NNPACK](https://github.com/Maratyszcza/NNPACK) is an acceleration package
+for neural network computations, which can run on x86-64, ARMv7, or ARM64 architecture CPUs.
+Using NNPACK, higher-level libraries like _MXNet_ can speed up
+the execution on multi-core CPU computers, including laptops and mobile devices.
+
+***Note***: As TVM already has natively tuned schedules, NNPACK is here mainly for reference and comparison purposes.
+For regular use, prefer the natively tuned TVM implementation.
+
+_TVM_ supports NNPACK for forward propagation (inference only) in convolution, max-pooling, and fully-connected layers.
+In this document, we give a high level overview of how to use NNPACK with _TVM_.
+
+## Conditions
+The underlying implementation of NNPACK utilizes several acceleration methods,
+including [fft](https://arxiv.org/abs/1312.5851) and [winograd](https://arxiv.org/abs/1509.09308).
+These algorithms work better on some special `batch size`, `kernel size`, and `stride` settings than on others,
+so depending on the context, not all convolution, max-pooling, or fully-connected layers can be powered by NNPACK.
+When favorable conditions for running NNPACK are not met, the computation falls back to the default implementation.
+
+NNPACK only supports Linux and OS X systems. Windows is not supported at present.
+
+## Build/Install NNPACK
+
+If the trained model meets certain conditions for using NNPACK,
+you can build TVM with NNPACK support.
+Follow these simple steps:
+* Build NNPACK shared library with the following commands. _TVM_ will link NNPACK dynamically.
+
+Note: The following NNPACK installation instructions have been tested on Ubuntu 16.04.
+
+### Build [Ninja](https://ninja-build.org/)
+
+NNPACK needs a recent version of Ninja, so we need to install ninja from source.
+```bash
+git clone git://github.com/ninja-build/ninja.git
+cd ninja
+./configure.py --bootstrap
+```
+
+Set the environment variable PATH to tell bash where to find the ninja executable. For example, assume we cloned ninja in the home directory `~`; then we can add the following line to `~/.bashrc`.
+```bash
+export PATH="${PATH}:~/ninja"
+```
+
+### Build [NNPACK](https://github.com/Maratyszcza/NNPACK)
+
+The new CMake version of NNPACK downloads [PeachPy](https://github.com/Maratyszcza/PeachPy) and other dependencies on its own.
+
+```bash
+git clone --recursive https://github.com/Maratyszcza/NNPACK.git
+cd NNPACK
+# Add PIC option in CFLAG and CXXFLAG to build NNPACK shared library
+sed -i "s|gnu99|gnu99 -fPIC|g" CMakeLists.txt
+sed -i "s|gnu++11|gnu++11 -fPIC|g" CMakeLists.txt
+mkdir build
+cd build
+# Generate ninja build rule and add shared library in configuration
+cmake -G Ninja -D BUILD_SHARED_LIBS=ON ..
+ninja
+sudo ninja install
+
+# Add NNPACK lib folder in your ldconfig
+echo "/usr/local/lib" > /etc/ld.so.conf.d/nnpack.conf
+sudo ldconfig
+```
+
+## Build TVM with NNPACK support
+
+```bash
+git clone --recursive https://github.com/dmlc/tvm
+```
+
+* Set `set(USE_NNPACK ON)` in config.cmake.
+* Set `NNPACK_PATH` to $(YOUR_NNPACK_INSTALL_PATH).
+
+After configuration, use `make` to build TVM:
+
+```bash
+make
+```
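+
+As a quick sanity check, the NNPACK contrib API can then be exercised from python. This is a hedged sketch: it assumes TVM was built with `set(USE_NNPACK ON)` and that the shapes meet NNPACK's conditions.
+
+```python
+import tvm
+from tvm.contrib import nnpack
+
+n, m = 64, 128
+data = tvm.placeholder((n,), name='data')
+weight = tvm.placeholder((m, n), name='weight')
+# Offload a fully connected (inference) layer to NNPACK.
+out = nnpack.fully_connected_inference(data, weight)
+```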
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
new file mode 100644
index 000000000000..fdaed2b5be40
--- /dev/null
+++ b/docs/langref/hybrid_script.rst
@@ -0,0 +1,172 @@
+.. _hybrid-langref-label:
+
+Hybrid Frontend Language Reference
+==================================
+
+Overview
+--------
+
+This hybrid frontend allows users to write preliminary versions of some idioms that have not yet
+been supported by TVM officially.
+
+Features
+--------
+
+Software Emulation
+~~~~~~~~~~~~~~~~~~
+
+Both software emulation and compilation are supported. To define a function,
+you need to use the ``tvm.hybrid.script`` decorator to indicate that this is a hybrid function:
+
+.. code-block:: python
+
+ import numpy
+ import tvm
+
+ @tvm.hybrid.script
+ def outer_product(a, b, c):
+     for i in range(a.shape[0]):
+         for j in range(b.shape[0]):
+             c[i, j] = a[i] * b[j]
+
+ a = numpy.random.rand(100)
+ b = numpy.random.rand(99)
+ c = numpy.zeros((100, 99))
+ outer_product(a, b, c)
+
+This decorator will automatically import the `Keywords`_ required for software emulation.
+After software emulation is done, the imported keywords will be cleaned up. Users do not need to
+worry about keyword conflicts and pollution.
+
+Every element passed for software emulation in the argument list is either a python variable
+or a ``numpy`` numeric type.
+
+Backend Compilation
+~~~~~~~~~~~~~~~~~~~
+
+The current parse interface looks like:
+
+.. code-block:: python
+
+ a = tvm.placeholder((100, ), name='a')
+ b = tvm.placeholder((99, ), name='b')
+ c = tvm.placeholder((100, 99), name='c')
+ tvm.hybrid.parse(outer_product, [a, b, c]) # return an ir root of this function
+
+If we pass these tvm tensors to this function, it returns an op node:
+
+**Under construction, we are still deciding what kind of node should be returned.**
+
+.. code-block:: python
+
+ a = tvm.placeholder((100, ), name='a')
+ b = tvm.placeholder((99, ), name='b')
+ c = tvm.placeholder((100, 99), name='c')
+ op = outer_product(a, b, c) # return the corresponding op node
+
+Tuning
+~~~~~~
+
+**Under construction, not truly supported yet.**
+
+Following the example above, you can use some tvm-like interfaces to tune the code:
+
+.. code-block:: python
+
+ sch = tvm.create_schedule(op)
+ jo, ji = sch.split(j, 4)
+ sch.vectorize(ji)
+
+``split``, ``reorder``, and loop_annotation will be supported!
+
+Loops
+~~~~~
+
+In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``.
+
+Here we use ``range`` (aka ``serial``), ``unroll``, ``parallel``, and ``vectorize``;
+these **4** keywords annotate the corresponding types of for loops.
+The usage is roughly the same as the Python standard ``range``; see the sketch below.
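+
+For example, a small illustrative sketch (not an official sample):
+
+.. code-block:: python
+
+ @tvm.hybrid.script
+ def loop_demo(a, b):
+     for i in parallel(8):      # iterations of i may run in parallel
+         for j in unroll(4):    # j is fully unrolled in the lowered IR
+             b[i, j] = a[i, j] + 1.0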
+
+Variables
+~~~~~~~~~
+
+All the mutable variables will be lowered to an array with size 1.
+It regards the first store of a variable as its declaration.
+
+.. note::
+
+ Unlike conventional Python, in hybrid script, the declared variable
+ can only be used in the scope level it is declared.
+
+
+.. note::
+
+ Currently, you can ONLY use basic-typed variables, i.e. the type of the
+ variable should be either ``float32``, or ``int32``.
+
+.. code-block:: python
+
+ for i in range(5):
+     s = 0 # declaration, this s will be a 1-array in lowered IR
+     for j in range(5):
+         s += a[i, j] # do something with s
+     b[i] = s # you can still use s at this level
+ a[0] = s # you CANNOT use s here, even though it is allowed in conventional Python
+ b = (1, 2) # this has NOT been supported yet!
+
+
+Attributes
+~~~~~~~~~~
+
+So far, ONLY tensors' ``shape`` attribute is supported! The ``shape`` attribute is essentially a
+tuple, so you MUST access it as an array. Also, currently, only constant-indexed access is supported.
+
+.. code-block:: python
+
+ x = a.shape[2] # OK!
+ for i in range(3):
+     for j in a.shape[i]: # BAD! i is not a constant!
+         # do something
+
+
+Conditional Statement and Expression
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ if condition:
+     # do something
+
+ a = b if condition else c
+
+However, NO ``True`` or ``False`` keyword is supported yet.
+
+
+Math Intrinsics
+~~~~~~~~~~~~~~~
+
+So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``,
+``tanh``, ``power``, and ``popcount``, are supported.
+No import is required; just as mentioned in `Software Emulation`_, just use them!
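+
+For instance, an illustrative one-liner inside a hybrid function body:
+
+.. code-block:: python
+
+ for i in range(a.shape[0]):
+     b[i] = sigmoid(a[i])   # no import needed for the intrinsic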
+
+Array Allocation
+~~~~~~~~~~~~~~~~
+
+**Under construction, this function will be supported later!**
+
+Use a function call ``allocation(shape, type, share/local)`` to declare an array buffer.
+The basic usage is roughly the same as a normal array.
+
+
+Thread Bind
+~~~~~~~~~~~
+
+
+You can also do loop-thread binding by writing code like this:
+
+.. code-block:: python
+
+ for tx in bind("threadIdx.x", 100):
+     a[tx] = b[tx]
+
+
+Keywords
+~~~~~~~~
+- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``
+- Math keywords: ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, ``popcount``
diff --git a/docs/langref/index.rst b/docs/langref/index.rst
new file mode 100644
index 000000000000..65f78d1d278b
--- /dev/null
+++ b/docs/langref/index.rst
@@ -0,0 +1,9 @@
+Language Reference
+==================
+This document provides references to the
+embedded languages in the TVM stack.
+
+.. toctree::
+ :maxdepth: 2
+
+ hybrid_script
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
new file mode 100644
index 000000000000..96a37b779e1e
--- /dev/null
+++ b/docs/nnvm_top.rst
@@ -0,0 +1,193 @@
+NNVM Core Tensor Operators
+==========================
+
+This page contains the list of core tensor operator primitives pre-defined in NNVM.
+The core tensor operator primitives (``nnvm.top``) cover typical workloads in deep learning.
+They can represent workloads in front-end frameworks, and provide basic building blocks for optimization.
+Since deep learning is a fast-evolving field, it is possible to encounter operators that are not listed here.
+NNVM is designed for this problem and can easily register new operators without changing the core library.
+
+.. note::
+
+ Each operator node in the graph IR contains the following two kinds of parameters.
+
+ - inputs: positional list of input tensors
+ - attrs: attributes about the operator (e.g. kernel_size in conv2d)
+
+ This document lists both inputs and attributes in the parameter field. You can distinguish them by the marked type. The inputs are of type Tensor, while the remaining parameters are attributes.
+ To construct the graph with NNVM python API, a user can pass in the input Tensors as positional arguments, and attributes as keyword arguments.
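+
+ For example, a small illustrative sketch of building a graph with the python API (attribute values are arbitrary):
+
+ .. code-block:: python
+
+  import nnvm.symbol as sym
+
+  x = sym.Variable("data")
+  w = sym.Variable("weight")
+  # inputs (data, weight) are Tensors passed as arguments;
+  # channels and kernel_size are attributes passed as keywords
+  y = sym.conv2d(data=x, weight=w, channels=32, kernel_size=(3, 3))
+  z = sym.relu(y)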
+
+
+Overview of Operators
+---------------------
+**Level 1: Basic Operators**
+
+This level enables fully connected multi-layer perceptron.
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.dense
+ nnvm.symbol.relu
+ nnvm.symbol.tanh
+ nnvm.symbol.sigmoid
+ nnvm.symbol.exp
+ nnvm.symbol.log
+ nnvm.symbol.sqrt
+ nnvm.symbol.elemwise_add
+ nnvm.symbol.elemwise_sub
+ nnvm.symbol.elemwise_mul
+ nnvm.symbol.elemwise_div
+ nnvm.symbol.elemwise_sum
+ nnvm.symbol.flatten
+ nnvm.symbol.concatenate
+ nnvm.symbol.expand_dims
+ nnvm.symbol.squeeze
+ nnvm.symbol.split
+ nnvm.symbol.dropout
+ nnvm.symbol.batch_norm
+ nnvm.symbol.softmax
+ nnvm.symbol.log_softmax
+ nnvm.symbol.pad
+ nnvm.symbol.block_grad
+
+
+**Level 2: Convolutions**
+
+This level enables typical convnet models.
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.conv2d
+ nnvm.symbol.conv2d_transpose
+ nnvm.symbol.max_pool2d
+ nnvm.symbol.avg_pool2d
+ nnvm.symbol.global_max_pool2d
+ nnvm.symbol.global_avg_pool2d
+
+
+**Level 3: Additional Tensor Ops**
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.reshape
+ nnvm.symbol.copy
+ nnvm.symbol.negative
+ nnvm.symbol.floor
+ nnvm.symbol.ceil
+ nnvm.symbol.round
+ nnvm.symbol.trunc
+ nnvm.symbol.abs
+ nnvm.symbol.leaky_relu
+ nnvm.symbol.__add_scalar__
+ nnvm.symbol.__sub_scalar__
+ nnvm.symbol.__rsub_scalar__
+ nnvm.symbol.__mul_scalar__
+ nnvm.symbol.__div_scalar__
+ nnvm.symbol.__rdiv_scalar__
+ nnvm.symbol.__pow_scalar__
+ nnvm.symbol.__rpow_scalar__
+ nnvm.symbol.__lshift_scalar__
+ nnvm.symbol.__rshift_scalar__
+
+**Level 4: Broadcast and Reductions**
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.transpose
+ nnvm.symbol.broadcast_to
+ nnvm.symbol.sum
+ nnvm.symbol.min
+ nnvm.symbol.max
+ nnvm.symbol.broadcast_add
+ nnvm.symbol.broadcast_sub
+ nnvm.symbol.broadcast_mul
+ nnvm.symbol.broadcast_div
+ nnvm.symbol.clip
+ nnvm.symbol.greater
+ nnvm.symbol.less
+ nnvm.symbol.expand_like
+ nnvm.symbol.reshape_like
+ nnvm.symbol.full
+ nnvm.symbol.full_like
+ nnvm.symbol.ones
+ nnvm.symbol.ones_like
+ nnvm.symbol.zeros
+ nnvm.symbol.zeros_like
+
+Detailed Definitions
+--------------------
+.. autofunction:: nnvm.symbol.dense
+.. autofunction:: nnvm.symbol.relu
+.. autofunction:: nnvm.symbol.tanh
+.. autofunction:: nnvm.symbol.sigmoid
+.. autofunction:: nnvm.symbol.exp
+.. autofunction:: nnvm.symbol.log
+.. autofunction:: nnvm.symbol.sqrt
+.. autofunction:: nnvm.symbol.elemwise_add
+.. autofunction:: nnvm.symbol.elemwise_sub
+.. autofunction:: nnvm.symbol.elemwise_mul
+.. autofunction:: nnvm.symbol.elemwise_div
+.. autofunction:: nnvm.symbol.elemwise_sum
+.. autofunction:: nnvm.symbol.flatten
+.. autofunction:: nnvm.symbol.concatenate
+.. autofunction:: nnvm.symbol.expand_dims
+.. autofunction:: nnvm.symbol.squeeze
+.. autofunction:: nnvm.symbol.split
+.. autofunction:: nnvm.symbol.dropout
+.. autofunction:: nnvm.symbol.batch_norm
+.. autofunction:: nnvm.symbol.softmax
+.. autofunction:: nnvm.symbol.log_softmax
+.. autofunction:: nnvm.symbol.pad
+.. autofunction:: nnvm.symbol.block_grad
+
+.. autofunction:: nnvm.symbol.conv2d
+.. autofunction:: nnvm.symbol.conv2d_transpose
+.. autofunction:: nnvm.symbol.max_pool2d
+.. autofunction:: nnvm.symbol.avg_pool2d
+.. autofunction:: nnvm.symbol.global_max_pool2d
+.. autofunction:: nnvm.symbol.global_avg_pool2d
+
+.. autofunction:: nnvm.symbol.reshape
+.. autofunction:: nnvm.symbol.copy
+.. autofunction:: nnvm.symbol.negative
+.. autofunction:: nnvm.symbol.floor
+.. autofunction:: nnvm.symbol.ceil
+.. autofunction:: nnvm.symbol.round
+.. autofunction:: nnvm.symbol.trunc
+.. autofunction:: nnvm.symbol.abs
+.. autofunction:: nnvm.symbol.leaky_relu
+.. autofunction:: nnvm.symbol.__add_scalar__
+.. autofunction:: nnvm.symbol.__sub_scalar__
+.. autofunction:: nnvm.symbol.__rsub_scalar__
+.. autofunction:: nnvm.symbol.__mul_scalar__
+.. autofunction:: nnvm.symbol.__div_scalar__
+.. autofunction:: nnvm.symbol.__rdiv_scalar__
+.. autofunction:: nnvm.symbol.__pow_scalar__
+.. autofunction:: nnvm.symbol.__rpow_scalar__
+.. autofunction:: nnvm.symbol.__lshift_scalar__
+.. autofunction:: nnvm.symbol.__rshift_scalar__
+
+.. autofunction:: nnvm.symbol.transpose
+.. autofunction:: nnvm.symbol.broadcast_to
+.. autofunction:: nnvm.symbol.sum
+.. autofunction:: nnvm.symbol.min
+.. autofunction:: nnvm.symbol.max
+.. autofunction:: nnvm.symbol.broadcast_add
+.. autofunction:: nnvm.symbol.broadcast_sub
+.. autofunction:: nnvm.symbol.broadcast_mul
+.. autofunction:: nnvm.symbol.broadcast_div
+.. autofunction:: nnvm.symbol.clip
+.. autofunction:: nnvm.symbol.greater
+.. autofunction:: nnvm.symbol.less
+.. autofunction:: nnvm.symbol.expand_like
+.. autofunction:: nnvm.symbol.reshape_like
+.. autofunction:: nnvm.symbol.full
+.. autofunction:: nnvm.symbol.full_like
+.. autofunction:: nnvm.symbol.ones
+.. autofunction:: nnvm.symbol.ones_like
+.. autofunction:: nnvm.symbol.zeros
+.. autofunction:: nnvm.symbol.zeros_like
diff --git a/docs/vta/.gitignore b/docs/vta/.gitignore
new file mode 100644
index 000000000000..a07068979a60
--- /dev/null
+++ b/docs/vta/.gitignore
@@ -0,0 +1 @@
+tutorials
\ No newline at end of file
diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst
new file mode 100644
index 000000000000..5b0ee966872b
--- /dev/null
+++ b/docs/vta/dev/config.rst
@@ -0,0 +1,70 @@
+VTA Configuration
+=================
+
+The VTA stack incorporates both a hardware accelerator stack and
+a TVM based software stack.
+VTA incorporates flexibility out of the box: by modifying the
+``vta/config/vta_config.json`` high-level configuration file,
+the user can change the shape of the tensor intrinsic,
+clock frequency, pipelining, data type width, and on-chip buffer sizes.
+
+Parameters Overview
+-------------------
+
+We explain the parameters listed in the ``vta_config.json`` file in the table
+below.
+
++-----------------------+------------+--------------------------------------------------------+
+| Attribute | Format | Description |
++=======================+============+========================================================+
+| ``TARGET`` | String | The TVM device target. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_TARGET`` | Int | FPGA frequency in MHz. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_VER`` | String | VTA hardware version number. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_WGT_WIDTH`` | Int (log2) | Weight data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_INP_BUFF_SIZE`` | Int (log2) | Input on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_WGT_BUFF_SIZE`` | Int (log2) | Weight on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_ACC_BUFF_SIZE`` | Int (log2) | Accumulator on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+
+
+ .. note::
+
+ When a parameter name is preceded with ``LOG``, it means that it describes a value that can only be expressed as a power of two.
+ For that reason we describe these parameters by their log2 value.
+ For instance, to describe an integer width of 8-bits for the input data types, we set the ``LOG_INP_WIDTH`` to be 3, which is the log2 of 8.
+ Similarly, to describe a 64kB micro-op buffer, we would set ``LOG_UOP_BUFF_SIZE`` to be 16.
+
+We provide additional detail below regarding each parameter:
+
+ - ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``.
+ - ``HW_TARGET``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz.
+ - ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance).
+ - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identify hardware bitstreams.
+ - ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``.
+ - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
+ - ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension.
+ - ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension.
+
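+To make this concrete, the sketch below shows what a ``vta_config.json`` could look like (illustrative values only, not recommended defaults):
+
+.. code-block:: json
+
+ {
+   "TARGET" : "pynq",
+   "HW_TARGET" : 100,
+   "HW_CLK_TARGET" : 8,
+   "HW_VER" : "0.0.0",
+   "LOG_INP_WIDTH" : 3,
+   "LOG_WGT_WIDTH" : 3,
+   "LOG_ACC_WIDTH" : 5,
+   "LOG_OUT_WIDTH" : 3,
+   "LOG_BATCH" : 0,
+   "LOG_BLOCK_IN" : 4,
+   "LOG_BLOCK_OUT" : 4,
+   "LOG_UOP_BUFF_SIZE" : 15,
+   "LOG_INP_BUFF_SIZE" : 15,
+   "LOG_WGT_BUFF_SIZE" : 18,
+   "LOG_ACC_BUFF_SIZE" : 17
+ }
+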
diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst
new file mode 100644
index 000000000000..fd19f969687c
--- /dev/null
+++ b/docs/vta/dev/hardware.rst
@@ -0,0 +1,282 @@
+VTA Hardware Guide
+==================
+
+We present a top-down overview of the VTA hardware design.
+This hardware design guide covers VTA hardware at two levels:
+
+ - An architectural overview of the VTA design and its ISA hardware-software
+ interface.
+ - A micro-architectural overview of the VTA hardware modules, and the
+ micro-code specification for the compute core.
+
+VTA Overview
+------------
+
+VTA is a generic deep learning accelerator built for fast and efficient dense linear algebra.
+VTA incorporates a simple RISC-like processor that can perform dense linear algebra operations on rank 1 or 2 tensor registers.
+In addition the design adopts decoupled access-execute to hide memory access latency.
+
+
+To a broader extent, VTA can serve as a template deep learning accelerator design for full stack optimization, exposing a generic tensor computation interface to the compiler stack.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png
+ :align: center
+ :width: 80%
+
+The figure above gives a high-level overview of the VTA hardware organization.
+VTA is composed of four modules that communicate among each other via FIFO queues and local memory blocks (SRAM), to enable task-level pipeline parallelism:
+
+- The fetch module takes care of loading an instruction stream from DRAM. It also decodes those instructions to route them into one of three command queues.
+- The load module takes care of loading input and weight tensors from DRAM into data-specialized on-chip memories.
+- The compute module performs both dense linear algebra computation with its GEMM core, and general computation with its tensor ALU. It also takes care of loading data from DRAM into the register file, and loading micro-op kernels into the micro-op cache.
+- The store module stores results produced by the compute core back to DRAM.
+
+HLS Hardware Source Organization
+--------------------------------
+
+The VTA design is currently specified in Vivado HLS C++, which is only supported
+by Xilinx toolchains.
+The VTA hardware sources are contained under ``vta/hardware/xilinx/sources``:
+
+ - ``vta.cc`` contains the definitions for each VTA module, as well as a top
+ level behavioral model for the top-level VTA design.
+ - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and
+ function prototypes declarations.
+
+In addition preprocessor macros are defined under ``vta/include/vta/hw_spec.h``.
+Many of these macro definitions are derived from the parameters listed in the
+``vta/config/vta_config.json`` file.
+The json file is processed by ``vta/config/vta_config.py`` to produce a string of
+compile flags that define the preprocessor macros.
+That string is used by the makefile in order to set those high-level
+parameters in both the HLS hardware synthesis compiler, and the C++
+compiler that builds the VTA runtime.
+
+HLS Module Example
+~~~~~~~~~~~~~~~~~~
+
+We show a definition of one of the VTA modules defined in C++:
+
+.. code-block:: c
+
+ void fetch(
+   uint32_t insn_count,
+   volatile insn_T *insns,
+   hls::stream<insn_T> &load_queue,
+   hls::stream<insn_T> &gemm_queue,
+   hls::stream<insn_T> &store_queue) {
+ #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+ #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
+ #pragma HLS INTERFACE axis port = load_queue
+ #pragma HLS INTERFACE axis port = gemm_queue
+ #pragma HLS INTERFACE axis port = store_queue
+ #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
+
+   INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
+ #pragma HLS PIPELINE II = 1
+     // Read instruction fields
+     insn_T insn = insns[pc];
+     // Do some partial decoding
+     opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
+     memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+     // Push to appropriate instruction queue
+     if (opcode == VTA_OPCODE_STORE) {
+       store_queue.write(insn);
+     } else if (opcode == VTA_OPCODE_LOAD &&
+                (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
+       load_queue.write(insn);
+     } else {
+       gemm_queue.write(insn);
+     }
+   }
+ }
+
+A few observations on HLS coding:
+ - *Parameters:* The parameter list of each function, combined with the
+ interface pragmas define the hardware interface exposed by the
+ generated hardware module.
+
+ - Parameters passed by value indicate a read-only hardware memory-mapped
+ register that the host can write to.
+ This fetch function for instance has an ``insn_count`` parameter
+ which will be synthesized as a memory mapped register for the host
+ to write to, in order to set the length of a given VTA instruction
+ sequence.
+ - Pointer parameters can mean one of two things depending on the interface
+ pragma being used.
+
+ - When used with a ``m_axi`` interface pragma, an AXI master interface
+ gets generated to provide DMA access to DRAM.
+ - When used with a ``bram`` interface pragma, a BRAM interface gets
+ generated to expose read and/or write ports to an FPGA block-RAM.
+ - HLS streams being passed by reference combined with the ``axis`` interface
+ pragma produce FIFO interfaces to the module. Hardware FIFOs provide a
+ useful synchronization mechanism between modules.
+ - *Pragmas*: Compiler pragmas are essential to define hardware implementation
+ of each module. We list several pragmas used in the VTA design to communicate
+ implementation requirements to the compiler.
+
+ - ``HLS INTERFACE``: specifies the interface of the synthesized
+ hardware module.
+ - ``HLS PIPELINE``: defines hardware pipeline performance target by setting
+ an initiation interval goal. When the ``II == 1`` target is set, it tells
+ the compiler that the synthesized hardware pipeline should be able to
+ execute one loop iteration per cycle.
+ - ``HLS DEPENDENCE``: instructs the compiler to ignore certain types
+ of dependence checks in a given loop. Consider a loop body that writes
+ and reads to the same BRAM structure, and needs to achieve an II of 1.
+ The HLS compiler has to assume worst-case scenario, whereby a read is
+ issued to an address that a past write updated the cycle prior: this
+ cannot be achieved given BRAM timing characteristics (it takes at least
+ 2 cycles to see the updated value). Therefore in order to achieve an II of 1,
+ the dependence checks have to be relaxed.
+ Note that when turning this optimization on, it falls onto
+ the software stack to prevent writes followed by reads to the same address.
+
+ .. note::
+ This `reference guide <https://www.xilinx.com/support/documentation/sw_manuals/xilinx2018_2/ug902-vivado-high-level-synthesis.pdf>`_
+ provides a much more in-depth, and complete specification of HLS for the Xilinx 2018.2 toolchains.
+
+Architectural Overview
+----------------------
+
+Instruction Set Architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+VTA's instruction set architecture (ISA) is composed of 4 CISC instructions that have a variable execution latency, two of which execute a micro-coded instruction sequence to perform computation.
+
+The VTA instructions are listed below:
+
+- ``LOAD`` instruction: loads a 2D tensor from DRAM into the input buffer, weight buffer, or register file. It can also load a micro-kernel into the micro-op cache. Supports dynamic padding when loading input and weight tiles.
+- ``GEMM`` instruction: performs a micro-op sequence of matrix-matrix multiplications over an input tensor and a weight tensor, and adds the result to a register-file tensor.
+- ``ALU`` instruction: performs a micro-op sequence of matrix-matrix ALU operations over register-file tensor data.
+- ``STORE`` instruction: stores a 2D tensor from the output buffer to DRAM.
+
+The ``LOAD`` instructions are executed by the load and compute modules, depending on the target memory buffer location.
+The ``GEMM`` and ``ALU`` instructions are executed by the compute module's GEMM core and tensor ALU.
+Finally, the ``STORE`` instructions are executed by the store module exclusively.
+The fields of each instruction are described in the figure below.
+The meaning of each field will be further explained in the :ref:`vta-uarch` section.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/vta_instructions.png
+ :align: center
+ :width: 100%
+
+.. note::
+ Note that the VTA ISA changes as VTA's architectural parameters are modified (i.e. GEMM core shape, data type, memory size etc.), and as a result the ISA does not guarantee compatibility across all variants of VTA.
+ This is acceptable however, since the VTA runtime adapts to parameter changes, and produces binary code tailored for the version of the accelerator that gets generated.
+ This exemplifies the co-design philosophy adopted by the VTA stack which embraces fluidity of the hardware-software interface.
+
+Dataflow Execution
+~~~~~~~~~~~~~~~~~~
+
+VTA relies on dependence FIFO queues between hardware modules to synchronize the execution of concurrent tasks.
+The figure below shows how a given hardware module can execute concurrently from its producer and consumer modules in a dataflow fashion through the use of dependence FIFO queues, and single-reader/single-writer SRAM buffers.
+Each module is connected to its consumer and producer via read-after-write (RAW) and write-after-read (WAR) dependence queues.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/dataflow.png
+ :align: center
+ :width: 100%
+
+The pseudo-code above describes how a module executes a given instruction predicated on dependences with other instructions.
+First, the dependence flags within each instruction are decoded in hardware.
+If the instruction has an incoming RAW dependence, execution is predicated upon receiving a RAW dependence token from the producer module.
+Similarly, if the task has an incoming WAR dependence, execution is predicated upon receiving a WAR dependence token from the consumer module.
+Finally when the task is done, we check for outgoing RAW and WAR dependences, and notify the consumer and producer modules respectively.
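+
+A hedged pseudo-code sketch of this predication logic is given below (field and queue names are illustrative, not the actual hardware source):
+
+.. code-block:: python
+
+ def run_module(insn_queue, raw_in, war_in, raw_out, war_out):
+     while True:
+         insn = insn_queue.read()
+         if insn.pop_prev_dep:   # incoming RAW dependence
+             raw_in.read()       # block until the producer sends a token
+         if insn.pop_next_dep:   # incoming WAR dependence
+             war_in.read()       # block until the consumer sends a token
+         execute(insn)           # perform the load/compute/store work
+         if insn.push_prev_dep:  # outgoing WAR dependence
+             war_out.write(1)    # notify the producer
+         if insn.push_next_dep:  # outgoing RAW dependence
+             raw_out.write(1)    # notify the consumer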
+
+.. note::
+ Note that the dependence tokens in this scenario are information-less.
+ This is because the instructions executed by each module cannot be reordered by design, as they arrive in FIFO order.
+
+Pipeline Expandability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The default VTA design is composed of four modules that describe a 3-stage ``load-compute-store`` task pipeline.
+Following the dataflow hardware organization principle, we can extend the VTA pipeline to include more stages.
+For example, we can envision separating the tensor ALU from the GEMM core in order to maximize the utilization of the GEMM core.
+This would result in a ``load-gemm-activate-store`` task pipeline which closely reflects the TPU design.
+Adding more stages has a cost however: it can add storage and extra logic overhead, which is why we opted for a default 3-stage pipeline.
+
+.. _vta-uarch:
+
+Microarchitectural Overview
+---------------------------
+
+We describe the modules that compose the VTA design.
+The module definitions are contained in ``vta/hardware/xilinx/sources/vta.cc``.
+
+Fetch Module
+~~~~~~~~~~~~
+
+VTA is programmed by a linear instruction stream.
+The fetch module is the entry point of VTA to the CPU and is programmed via three memory mapped registers:
+
+- The read-write ``control`` register starts the fetch module, and is read to check for its completion.
+- The write-only ``insn_count`` register sets the number of instructions to execute.
+- The write-only ``insns`` register sets the start address of the instruction stream in DRAM.
+
+The CPU prepares the instruction stream in a physically-contiguous DRAM buffer managed by the VTA runtime.
+When the instruction stream is ready, the CPU writes the start physical address into the ``insns`` register, the length of the instruction stream into the ``insn_count`` register, and asserts the start signal in the ``control`` register.
+This procedure starts VTA, which reads in the instruction stream from DRAM via DMA.
+
+Upon accessing the instruction stream, the fetch module partially decodes instructions, and pushes those instructions into command queues that feed into the load, compute, and store modules:
+
+- ``STORE`` instructions are pushed to the store command queue to be processed by the store module.
+- ``GEMM`` and ``ALU`` instructions are pushed to the compute command queue to be processed by the compute module.
+- ``LOAD`` instructions that describe a load operation of micro-op kernels or register file data are pushed to the compute command queue to be processed by the compute module.
+- ``LOAD`` instructions that describe a load operation of input or weight data are pushed to the load command queue to be processed by the load module.
+
+When one of the command queues becomes full, the fetch module stalls until the queue is not full.
+Consequently, the command queues are sized to be deep enough to allow for a wide execution window, and allow multiple tasks to be in flight concurrently across the ``load-compute-store`` pipeline.
+
+
+Compute Module
+~~~~~~~~~~~~~~
+
+VTA's compute module acts as a RISC processor that performs computation on tensor registers rather than scalar registers.
+Two functional units mutate the register file: the tensor ALU, and the GEMM core.
+
+The compute module executes RISC micro-ops from the micro-op cache.
+There are two types of compute micro-ops: ALU and GEMM operations.
+To minimize the footprint of micro-op kernels, while avoiding the need for control-flow instructions such as conditional jumps, the compute module executes micro-op sequences inside a two-level nested loop that computes the location of each tensor register via an affine function.
+This compression approach helps reduce the micro-kernel instruction footprint, and applies to both matrix multiplication and 2D convolution, commonly found in neural network operators.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/gemm_core.png
+ :align: center
+ :width: 100%
+
+The **GEMM core** evaluates GEMM instructions, by executing a micro-code sequence in a 2-level nested loop described in the Figure above.
+The GEMM core can perform one input-weight matrix multiplication per cycle.
+The dimensions of the single-cycle matrix multiplication define a hardware *tensorization intrinsic* which the TVM compiler has to lower a computation schedule onto.
+This tensorization intrinsic is defined by the dimensions of the input, weight and accumulator tensors.
+Each data type can have a different integer precision: typically both weight and input types are low-precision (8-bits or less), while the accumulator tensor has a wider type to prevent overflows (32-bits).
+In order to keep the GEMM core busy, each of the input buffer, weight buffer, and register file have to expose sufficient read/write bandwidth.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/alu_core.png
+ :align: center
+ :width: 100%
+
+The **Tensor ALU** supports a set of standard operations to implement common activation, normalization, and pooling operators.
+Because VTA is a modular design, the range of operators that the Tensor ALU supports can be extended for higher operator coverage, at the expense of higher resource utilization.
+The Tensor ALU can perform tensor-tensor operations, as well as tensor-scalar operations on an immediate value.
+The tensor ALU opcode and the immediate value are specified by the high-level CISC instruction.
+In the context of tensor ALU computation, the micro-code only specifies data access patterns.
+
+.. note::
+ In terms of computational throughput, the Tensor ALU does not execute at a rate of one operation per cycle.
+ The limitation comes from the lack of read ports: since only one register file tensor can be read per cycle, the tensor ALU has an initiation interval of at least 2 (i.e. it performs at most one operation every two cycles).
+ In addition, performing a whole tensor-tensor operation at once can be expensive, especially given that register file types are wide (typically 32-bit integers).
+ As a result, in order to balance the resource utilization footprint of the Tensor ALU with the GEMM core, a tensor-tensor operation is by default performed via vector-vector operations over multiple cycles.
+
+
+Load and Store Modules
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/2d_dma.png
+ :align: center
+ :width: 100%
+
+The load and store modules perform 2D DMA transfers with a strided access pattern between DRAM and SRAM.
+In addition, the load module can insert 2D padding on the fly, which is useful when blocking a 2D convolution.
+This means that VTA can tile 2D convolution inputs without paying the overhead of re-laying out data in DRAM to insert spatial padding around input and weight tiles.
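+
+A hedged software sketch of such a strided load with on-the-fly padding follows (argument names are illustrative; the hardware performs the equivalent work in a pipelined fashion):
+
+.. code:: python
+
+   def dma_load_2d(dram, base, x_size, y_size, x_stride, pad):
+       sram = []
+       for y in range(-pad, y_size + pad):
+           for x in range(-pad, x_size + pad):
+               inside = 0 <= x < x_size and 0 <= y < y_size
+               # out-of-bounds elements are materialized as zero padding
+               sram.append(dram[base + y * x_stride + x] if inside else 0)
+       return sram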
+
+
diff --git a/docs/vta/dev/index.rst b/docs/vta/dev/index.rst
new file mode 100644
index 000000000000..788bafe34b3e
--- /dev/null
+++ b/docs/vta/dev/index.rst
@@ -0,0 +1,14 @@
+VTA Design and Developer Guide
+==============================
+
+This developer guide details the complete VTA-TVM hardware-software stack.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_stack.png
+ :align: center
+ :width: 60%
+
+.. toctree::
+ :maxdepth: 2
+
+ config
+ hardware
\ No newline at end of file
diff --git a/docs/vta/hardware.rst b/docs/vta/hardware.rst
new file mode 100644
index 000000000000..294b99a8269f
--- /dev/null
+++ b/docs/vta/hardware.rst
@@ -0,0 +1,2 @@
+VTA Hardware Design Overview
+============================
diff --git a/docs/vta/index.rst b/docs/vta/index.rst
new file mode 100644
index 000000000000..d29dd9f2ffcf
--- /dev/null
+++ b/docs/vta/index.rst
@@ -0,0 +1,36 @@
+VTA: Deep Learning Accelerator Stack
+====================================
+
+The Versatile Tensor Accelerator (VTA) is an open, generic, and customizable deep learning accelerator with a complete TVM-based compiler stack. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators. Together TVM and VTA form an end-to-end hardware-software deep learning system stack that includes hardware design, drivers, a JIT runtime, and an optimizing compiler stack based on TVM.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png
+ :align: center
+ :width: 60%
+
+VTA has the following key features:
+
+- Generic, modular, open-source hardware.
+- Streamlined workflow to deploy to FPGAs.
+- Simulator support to prototype compilation passes on regular workstations.
+- Pynq-based driver and JIT runtime for both simulated and FPGA hardware back-ends.
+- End to end TVM stack integration.
+
+This page contains links to all the resources related to VTA:
+
+
+.. toctree::
+ :maxdepth: 1
+
+ install
+ dev/index
+ tutorials/index
+
+
+Literature
+----------
+
+- Read the VTA `release blog post`_.
+- Read the VTA tech report: `An Open Hardware Software Stack for Deep Learning`_.
+
+.. _release blog post: https://tvm.ai/2018/07/12/vta-release-announcement.html
+.. _An Open Hardware Software Stack for Deep Learning: https://arxiv.org/abs/1807.04188
\ No newline at end of file
diff --git a/docs/vta/install.md b/docs/vta/install.md
new file mode 100644
index 000000000000..ca5969386e80
--- /dev/null
+++ b/docs/vta/install.md
@@ -0,0 +1,263 @@
+VTA Installation Guide
+======================
+
+We present three installation guides, each building on the previous one:
+1. [Simulator installation](#vta-simulator-installation)
+2. [Hardware test setup](#vta-pynq-based-test-setup)
+3. [FPGA toolchain installation](#vta-fpga-toolchain-installation)
+
+## VTA Simulator Installation
+
+You need [TVM installed](https://docs.tvm.ai/install/index.html) on your machine.
+For a quick and easy start, use the pre-built [TVM Docker image](https://docs.tvm.ai/install/docker.html).
+
+The VTA simulator library is built by default with TVM.
+Add the VTA library to your python path to run the VTA examples.
+
+```bash
+export PYTHONPATH=/path/to/vta/python:${PYTHONPATH}
+```
+
+### Testing your VTA Simulation Setup
+
+To ensure that you've properly installed the VTA python package, run the following 2D convolution testbench.
+
+```bash
+python <tvm root>/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+```
+
+> Note: For every convolution layer, the throughput is reported in GOPS. These numbers reflect the computational throughput that the simulator achieves by evaluating the convolutions in software.
+
+You are invited to try out our [VTA programming tutorials](https://docs.tvm.ai/vta/tutorials/index.html).
+
+
+### Advanced Configuration (optional)
+
+VTA is a generic configurable deep learning accelerator.
+The configuration is specified by `vta_config.json` under the TVM root folder.
+This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack.
+
+The VTA configuration file also specifies the TVM compiler target.
+When `TARGET` is set to `sim`, all TVM workloads execute on the VTA simulator.
+You can modify the contents of the configuration file to rebuild VTA with a different parameterization.
+To do so:
+
+```bash
+cd <tvm root>
+cp vta/config/vta_config.json vta_config.json
+# edit vta_config.json
+make vta
+```
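+
+If you prefer to script the change, here is a minimal sketch that edits the copied file with Python's standard `json` module; the `TARGET` field is taken from this guide, and the other contents of the file are left untouched.
+
+```python
+import json
+
+with open("vta_config.json") as f:
+    cfg = json.load(f)
+print(cfg["TARGET"])    # "sim" in the default configuration
+cfg["TARGET"] = "pynq"  # e.g. retarget to the Pynq FPGA (see the next guide)
+with open("vta_config.json", "w") as f:
+    json.dump(cfg, f, indent=2)
+```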
+
+## VTA Pynq-Based Test Setup
+
+This second guide extends the *VTA Simulator Installation* guide above to run FPGA hardware tests of the complete TVM and VTA software-hardware stack.
+In terms of hardware components you'll need:
+* The [Pynq](http://www.pynq.io/) FPGA development board, which can be acquired for $200 ($150 for academics) from [Digilent](https://store.digilentinc.com/pynq-z1-python-productivity-for-zynq/).
+* An Ethernet-to-USB adapter to connect the Pynq board to your development machine.
+* An 8+GB micro SD card.
+* An AC to DC 12V 3A power adapter.
+
+This guide covers the following themes:
+1. Pynq board setup instructions.
+2. Pynq-side RPC server build and deployment.
+3. Revisiting the test examples from the *VTA Simulator Installation* guide, this time executing on the Pynq board.
+
+### Pynq Board Setup
+
+Setup your Pynq board based on the [Pynq board getting started tutorial](http://pynq.readthedocs.io/en/latest/getting_started.html).
+You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point).
+* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.1](http://pynq-testing.readthedocs.io/en/image_v2.2/getting_started/pynq_image.html) (released 21 Feb 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
+* For this test setup, follow the ["Connect to a Computer"](http://pynq.readthedocs.io/en/latest/getting_started.html#connect-to-a-computer) Ethernet setup instructions. To be able to talk to the board, make sure to [assign your computer a static IP address](http://pynq.readthedocs.io/en/latest/appendix.html#assign-your-computer-a-static-ip).
+
+Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board:
+```bash
+# To connect to the Pynq board use the [username, password] combo: [xilinx, xilinx]
+ssh xilinx@192.168.2.99
+```
+
+### Pynq-Side RPC Server Build & Deployment
+
+Because the direct board-to-computer connection prevents the board from accessing the internet, we'll need to mount the Pynq's file system into your development machine's file system with [sshfs](https://www.digitalocean.com/community/tutorials/how-to-use-sshfs-to-mount-remote-file-systems-over-ssh). We then clone the TVM repository into the sshfs mountpoint on the development machine.
+
+```bash
+# On the Host-side
+mkdir <mountpoint>
+sshfs xilinx@192.168.2.99:/home/xilinx <mountpoint>
+cd <mountpoint>
+git clone --recursive https://github.com/dmlc/tvm
+# When finished, you can leave the mountpoint and unmount the directory
+cd ~
+sudo umount <mountpoint>
+```
+
+Now that we've cloned the TVM repository (which includes VTA) into the Pynq's file system, we can ssh into the board and launch the build of the TVM-based RPC server.
+The build process should take roughly 5 minutes.
+
+```bash
+ssh xilinx@192.168.2.99
+# Build TVM runtime library (takes 5 mins)
+cd /home/xilinx/tvm
+mkdir build
+cp cmake/config.cmake build/.
+# Copy pynq specific configuration
+cp vta/config/pynq_sample.json build/vta_config.json
+cd build
+cmake ..
+make runtime vta -j2
+# Build VTA RPC server (takes 1 min)
+cd ..
+sudo ./apps/pynq_rpc/start_rpc_server.sh # pw is 'xilinx'
+```
+
+You should see the following displayed when the RPC server starts. In order to run the next examples, you'll need to leave the RPC server running in an `ssh` session.
+```
+INFO:root:RPCServer: bind to 0.0.0.0:9091
+```
+
+Tips regarding the Pynq RPC Server:
+* The RPC server should be listening on port `9091`. If not, an earlier process may have terminated unexpectedly; in that case, reboot the Pynq and re-run the RPC server.
+* To kill the RPC server, press `Ctrl + c`. You can re-run it with `sudo ./apps/pynq_rpc/start_rpc_server.sh`.
+* If unresponsive, the board can be rebooted by power-cycling it with the physical power switch.
+
+### Testing your Pynq-based Hardware Setup
+
+Before running the examples on your development machine, you'll need to configure your host environment as follows:
+```bash
+# On the Host-side
+export VTA_PYNQ_RPC_HOST=192.168.2.99
+export VTA_PYNQ_RPC_PORT=9091
+```
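+
+These variables are picked up by the host-side test scripts to open an RPC session to the board. A minimal sketch of what that looks like with TVM's RPC client (assuming the `tvm.rpc` module path):
+
+```python
+import os
+from tvm import rpc
+
+host = os.environ["VTA_PYNQ_RPC_HOST"]
+port = int(os.environ["VTA_PYNQ_RPC_PORT"])
+remote = rpc.connect(host, port)  # handle used to offload work to the Pynq
+```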
+
+In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.
+Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`.
+> Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board.
+
+```bash
+# On the Host-side
+cd <tvm root>
+cp vta/config/pynq_sample.json vta_config.json
+```
+
+Once again, we will run the 2D convolution testbench.
+First, we need to program the Pynq's FPGA with a VTA bitstream, and build the VTA runtime on the board via RPC.
+The following `test_program_rpc.py` script will perform two operations:
+* FPGA programming, by downloading a pre-compiled bitstream from a [VTA bitstream repository](https://github.com/uwsaml/vta-distro) that matches the default `vta_config.json` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA.
+* Runtime building on the Pynq, which needs to be run every time the `vta_config.json` configuration is modified. This ensures that the VTA software runtime that generates the accelerator's executable via just-in-time (JIT) compilation matches the specifications of the VTA design that is programmed on the FPGA. The build process takes about 30 seconds to complete so be patient!
+
+```bash
+# On the Host-side
+python <tvm root>/vta/tests/python/pynq/test_program_rpc.py
+```
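+
+Under the hood, the script roughly amounts to the following sketch; `vta.program_fpga` is used later in this guide, while `vta.reconfig_runtime` is an assumed helper name for the runtime rebuild step listed above.
+
+```python
+import os
+from tvm import rpc
+import vta
+
+remote = rpc.connect(os.environ["VTA_PYNQ_RPC_HOST"],
+                     int(os.environ["VTA_PYNQ_RPC_PORT"]))
+vta.program_fpga(remote, bitstream=None)  # None picks a pre-built bitstream matching vta_config.json
+vta.reconfig_runtime(remote)              # rebuild the JIT runtime for the current configuration
+```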
+
+> Tip: You can track progress of the FPGA programming and the runtime rebuilding steps by looking at the RPC server's logging messages in your Pynq `ssh` session.
+
+We are now ready to run the 2D convolution testbench in hardware.
+
+```bash
+# On the Host-side
+python <tvm root>/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+```
+
+The performance metrics measured on the Pynq board will be reported for each convolutional layer.
+
+You can also try out our [VTA programming tutorials](https://docs.tvm.ai/vta/tutorials/index.html).
+
+
+## VTA FPGA Toolchain Installation
+
+This third and last guide allows users to generate custom VTA bitstreams using free-to-use Xilinx compilation toolchains.
+
+### Xilinx Toolchain Installation
+
+We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains.
+Our guide is written for Linux (Ubuntu) installation.
+
+You’ll need to install Xilinx’s FPGA compilation toolchain, [Vivado HL WebPACK 2018.2](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain.
+
+#### Obtaining and Launching the Vivado GUI Installer
+
+1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2018-2.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2018.2: WebPACK and Editions.
+2. You’ll have to sign in with a Xilinx account. If you don’t have one, creating an account takes about two minutes.
+3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin`.
+4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed:
+```bash
+chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
+```
+5. Now you can execute the binary:
+```bash
+./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
+```
+
+#### Xilinx Vivado GUI Installer Steps
+
+At this point you've launched the Vivado 2018.2 installer GUI program.
+
+1. Click “Next” on the *Welcome* screen.
+2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next”.
+3. On the *Accept License Agreements* screen, accept all terms before clicking “Next”.
+4. On the *Select Edition to Install* screen, select “Vivado HL WebPACK” before clicking “Next”.
+5. Under the *Vivado HL WebPACK* screen, before hitting “Next”, check the following options (the rest should be unchecked):
+ * Design Tools -> Vivado Design Suite -> Vivado
+ * Devices -> Production Devices -> SoCs -> Zynq-7000 (if you are targeting the Pynq board)
+ * Devices -> Production Devices -> SoCs -> UltraScale+ MPSoC (if you are targeting the Ultra-96 board)
+6. The total download size is about 5GB, and the required disk space is 23GB.
+7. On the *Select Destination Directory* screen, set the installation directory before clicking “Next”. Some paths may be highlighted in red; that’s because the installer doesn’t have permission to write to them. In that case, select a path that doesn’t require special write permissions (e.g. your home directory).
+8. On the *Installation Summary* screen, hit “Install”.
+9. An *Installation Progress* window will pop up to track the progress of the download and the installation.
+10. This process will take about 20-30 minutes depending on your connection speed.
+11. A pop-up window will inform you that the installation completed successfully. Click "OK".
+12. Finally the *Vivado License Manager* will launch. Select "Get Free ISE WebPACK, ISE/Vivado IP or PetaLinux License" and click "Connect Now" to complete the license registration process.
+
+#### Environment Setup
+
+The last step is to update your `~/.bashrc` with the following lines, which put the Xilinx binaries on your `PATH` so you can launch compilation scripts from the command line (set `${XILINX_PATH}` to your Vivado installation directory).
+```bash
+# Xilinx Vivado 2018.2 environment
+export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2
+export PATH=${XILINX_VIVADO}/bin:${PATH}
+```
+
+### Custom VTA Bitstream Compilation
+
+High-level hardware parameters are listed in the VTA configuration file and can be customized by the user.
+For this custom VTA bitstream compilation exercise, we'll change the frequency of our design, so it can be clocked a little faster.
+* Set the `HW_FREQ` field to `142`. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it is to close timing; increasing the frequency can lead to timing violations and thus faulty hardware execution.
+* Set the `HW_CLK_TARGET` to `6`. This parameter refers to the target clock period in nanoseconds for HLS; a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock calls for a 7ns target (see the quick check below), but we intentionally lower the clock target to 6ns to pipeline the design more aggressively.
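+
+The clock-period arithmetic behind these two settings can be checked quickly:
+
+```python
+freq_mhz = 142
+period_ns = 1e3 / freq_mhz
+print(round(period_ns, 2))  # 7.04 -> a 142MHz clock corresponds to a ~7ns period
+```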
+
+Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/hardware/xilinx/`.
+
+If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter:
+```bash
+cd <tvm root>/vta/hardware/xilinx
+make ip MODE=sim
+```
+
+If you just want to generate the HLS-based VTA IP cores without launching the full place-and-route flow, enter:
+```bash
+make ip
+```
+You'll be able to view the HLS synthesis reports under `<tvm root>/vta/build/hardware/xilinx/hls/<configuration>/<module>/solution0/syn/report/<module>_csynth.rpt`.
+> Note: The `<configuration>` name is a string that summarizes the VTA configuration parameters listed in `vta_config.json`. The `<module>` name refers to one of the modules (HLS functions) that compose the high-level VTA pipeline.
+
+Finally, to run the full hardware compilation and generate the VTA bitstream, run:
+
+```bash
+make
+```
+
+This process is lengthy and can take up to an hour to complete, depending on your machine's specs.
+We recommend setting the `VTA_HW_COMP_THREADS` variable in the Makefile to take full advantage of all the cores on your development machine.
+
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
+
+### Use the Custom Bitstream
+
+We can program the new VTA FPGA bitstream by setting the bitstream path of the `vta.program_fpga()` function in the tutorial examples, or in the `test_program_rpc.py` script.
+
+```python
+vta.program_fpga(remote, bitstream="<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
+```
+
+Rather than downloading a pre-built bitstream from the VTA bitstream repository, TVM will now use the new bitstream you just generated, which is a VTA design clocked at a higher frequency.
+Do you observe a noticeable performance increase on the ImageNet classification example?
diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index 9c2b5194ebe9..93bff2762481 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file api_registry.h
+ * \file tvm/api_registry.h
* \brief This files include necessary headers to
* be used to register an global API function.
*/
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index c4f338f1cd47..6a3c395fd404 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file arithmetic.h
+ * \file tvm/arithmetic.h
* \brief Algebra and set operations and simplifications.
*/
#ifndef TVM_ARITHMETIC_H_
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 9a78c5ed503f..d113f45352bb 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file base.h
+ * \file tvm/base.h
* \brief Defines the base data structure
*/
#ifndef TVM_BASE_H_
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 610532e261a3..41fa1fa804a8 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file buffer.h
+ * \file tvm/buffer.h
* \brief Symbolic n-dimensional array, to represent a memory buffer.
*/
#ifndef TVM_BUFFER_H_
@@ -51,8 +51,11 @@ class Buffer : public NodeRef {
* \brief Get access ptr to the entire buffer.
* \param access_mask The access mask
* \param ptr_type The type of the pointer.
+ * \param content_lanes The number of lanes for the (data) type.
+ * \param offset The offset of ptr.
*/
- TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle()) const;
+ TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(),
+ int content_lanes = 1, Expr offset = make_const(Int(32), 0)) const;
/*!
* \brief Create an Expr that does a vector load at begin index.
* \param begin The beginning index
@@ -122,6 +125,11 @@ class BufferNode : public Node {
v->Visit("offset_factor", &offset_factor);
}
+ /*! \return preferred index type for this buffer node */
+ Type DefaultIndexType() const {
+ return shape.size() != 0 ? shape[0].type() : Int(32);
+ }
+
// User can specify data_alignment and offset_factor to be 0
// A default value will be picked.
TVM_DLL static Buffer make(Var ptr,
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
new file mode 100644
index 000000000000..96b876fe92f0
--- /dev/null
+++ b/include/tvm/build_module.h
@@ -0,0 +1,464 @@
+/*!
+* Copyright (c) 2017 by Contributors
+* \file tvm/build_module.h
+* \brief Functions for compiling ops.
+*/
+#ifndef TVM_BUILD_MODULE_H_
+#define TVM_BUILD_MODULE_H_
+
+#include <string>
+#include <vector>
+#include <utility>
+#include <unordered_map>
+#include <unordered_set>
+#include "./runtime/packed_func.h"
+#include "./schedule_pass.h"
+#include "./lowered_func.h"
+
+namespace tvm {
+using namespace tvm::runtime;
+
+/*!
+* \brief Container for target device information.
+* Use target::llvm, target::cuda etc functions instead of constructing directly.
+*/
+class TargetNode : public Node {
+ public:
+ /*! \brief The name of the target device */
+ std::string target_name;
+ /*! \brief The name of the specific device */
+ std::string device_name;
+ /*! \brief The type of the target device */
+ int device_type;
+ /*! \brief The maximum threads that a schedule should use for this device */
+ int max_num_threads = 1;
+ /*! \brief The warp size that should be used by the LowerThreadAllreduce pass */
+ int thread_warp_size = 1;
+ /*! \brief Keys for this target */
+ Array<Expr> keys_array;
+ /*! \brief Options for this target */
+ Array<Expr> options_array;
+ /*! \brief Collection of imported libs */
+ Array<Expr> libs_array;
+
+ /*! \return the full device string to pass to codegen::Build */
+ EXPORT std::string str() const;
+
+ void VisitAttrs(AttrVisitor* v) final {
+ v->Visit("target_name", &target_name);
+ v->Visit("device_name", &device_name);
+ v->Visit("device_type", &device_type);
+ v->Visit("max_num_threads", &max_num_threads);
+ v->Visit("thread_warp_size", &thread_warp_size);
+ v->Visit("keys_array", &keys_array);
+ v->Visit("options_array", &options_array);
+ v->Visit("libs_array", &libs_array);
+ }
+
+ /*! \brief Get the keys for this target as a vector of string */
+ EXPORT std::vector<std::string> keys() const;
+
+ /*! \brief Get the options for this target as a vector of string */
+ EXPORT std::vector<std::string> options() const;
+
+ /*! \brief Get the imported libs for this target as an unordered_set of string */
+ EXPORT std::unordered_set<std::string> libs() const;
+
+ static constexpr const char* _type_key = "Target";
+ TVM_DECLARE_NODE_TYPE_INFO(TargetNode, Node);
+};
+
+class Target : public NodeRef {
+ public:
+ Target() {}
+ explicit Target(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+ /*!
+ * \brief Create a Target given a string
+ * \param target_str the string to parse
+ */
+ EXPORT static Target create(const std::string& target_str);
+
+ /*!
+ * \brief Push a new target context onto the thread local stack. The Target on top of
+ * the stack is used to determine which specialization to use when invoking a GenericFunc.
+ * \param target The target to set as the current context.
+ */
+ EXPORT static void EnterTargetScope(const tvm::Target& target);
+
+ /*!
+ * \brief Pop a target off the thread local context stack, restoring the previous target
+ * as the current context.
+ */
+ EXPORT static void ExitTargetScope();
+
+ /*!
+ * \brief Get the current target context from thread local storage.
+ * \param allow_not_defined If the context stack is empty and this is set to true, an
+ * undefined Target will be returned. Otherwise, an empty context stack will cause a
+ * runtime error.
+ * \return The target that is the current context. The target may not be defined if
+ * allow_not_defined is true.
+ */
+ EXPORT static tvm::Target current_target(bool allow_not_defined = true);
+
+ inline const TargetNode* operator->() const {
+ return static_cast<const TargetNode*>(node_.get());
+ }
+
+ using ContainerType = TargetNode;
+};
+
+/*!
+ * \brief RAII container to provide a scoped target context. Pushes a target onto the
+ * context stack when constructed, and pops it when destructed.
+ */
+struct TargetContext {
+ /*!
+ * \brief Enter a new target context. The given target becomes the new current context.
+ * When the TargetContext is destructed, the previous context is restored.
+ * \param target The target to set as the new current context.
+ */
+ explicit TargetContext(const tvm::Target& target) {
+ Target::EnterTargetScope(target);
+ }
+
+ /*! \brief Destructor. Pops the context off the thread local stack. */
+ ~TargetContext() {
+ Target::ExitTargetScope();
+ }
+};
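+
+// Example usage (sketch):
+//
+//   {
+//     TargetContext ctx(target::cuda());    // CUDA becomes the current target
+//     Target t = Target::current_target();  // returns the CUDA target
+//   }                                       // previous target restored here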
+
+/*! \brief This namespace provides functions to construct Target instances */
+namespace target {
+/*! \return A target for LLVM */
+EXPORT Target llvm(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for CUDA */
+EXPORT Target cuda(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for ROCm */
+EXPORT Target rocm(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for OpenCL */
+EXPORT Target opencl(const std::vector<std::string>& options =
+                     std::vector<std::string>());
+
+/*! \return A target for Metal */
+EXPORT Target metal(const std::vector<std::string>& options =
+                    std::vector<std::string>());
+
+/*! \return A target for rasp */
+EXPORT Target rasp(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for Mali */
+EXPORT Target mali(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for Intel Graphics */
+EXPORT Target intel_graphics(const std::vector<std::string>& options =
+                             std::vector<std::string>());
+
+/*! \return A target for stackvm */
+EXPORT Target stackvm(const std::vector<std::string>& options =
+                      std::vector<std::string>());
+
+} // namespace target
+
+class BuildConfig;
+
+/*!
+* \brief Container for build configuration options
+*/
+class BuildConfigNode : public Node {
+ public:
+ /*!
+ * \brief The data alignment to use when constructing buffers. If this is set to
+ * -1, then TVM's internal default will be used
+ */
+ int data_alignment = -1;
+ /*!
+ * \brief The offset factor to use when constructing buffers. If this is set to
+ * 0, then the offset field is not used.
+ */
+ int offset_factor = 0;
+
+ /*!
+ * \brief Splitting factor for loop splitting. If this is set to zero, no splitting will be
+ * done. Otherwise, a split will be done with this factor and the inner loop will be unrolled.
+ */
+ int double_buffer_split_loop = 1;
+ /*! \brief Threshold of number of steps in the loop to be automatically unrolled */
+ int auto_unroll_max_step = 0;
+ /*! \brief The maximum nested level of loops that can be automatically unrolled */
+ int auto_unroll_max_depth = 8;
+ /*! \brief The maximum extent of loop that will be unrolled */
+ int auto_unroll_max_extent = 0;
+ /*!
+ * \brief Whether to explicitly unroll the loop. If set to false, the unroll hint will
+ * be passed to the CodeGen phase. Set to true if CodeGen supports unroll pragma.
+ */
+ bool unroll_explicit = true;
+
+ /*! \brief Set to true if buffer arguments do not overlap. This enables more optimization. */
+ bool restricted_func = true;
+
+ /*! \brief Whether to detect global barrier */
+ bool detect_global_barrier = false;
+
+ /*! \brief Whether to partition const loop */
+ bool partition_const_loop = false;
+
+ /*! \brief List of (phase, function) passes to be injected into the lowering pipeline */
+ std::vector< std::pair<int, PackedFunc> > add_lower_pass;
+
+ /*! \brief Whether to dump the IR of each pass (only when building from python) */
+ bool dump_pass_ir = false;
+
+ void VisitAttrs(AttrVisitor* v) final {
+ v->Visit("data_alignment", &data_alignment);
+ v->Visit("offset_factor", &offset_factor);
+ v->Visit("double_buffer_split_loop", &double_buffer_split_loop);
+ v->Visit("auto_unroll_max_step", &auto_unroll_max_step);
+ v->Visit("auto_unroll_max_depth", &auto_unroll_max_depth);
+ v->Visit("auto_unroll_max_extent", &auto_unroll_max_extent);
+ v->Visit("unroll_explicit", &unroll_explicit);
+ v->Visit("restricted_func", &restricted_func);
+ v->Visit("detect_global_barrier", &detect_global_barrier);
+ v->Visit("partition_const_loop", &partition_const_loop);
+ v->Visit("dump_pass_ir", &dump_pass_ir);
+ }
+
+ static constexpr const char* _type_key = "BuildConfig";
+ TVM_DECLARE_NODE_TYPE_INFO(BuildConfigNode, Node);
+};
+
+/*!
+* \brief Container for build configuration options
+*/
+class BuildConfig : public ::tvm::NodeRef {
+ public:
+ BuildConfig() {}
+ explicit BuildConfig(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+
+ const BuildConfigNode* operator->() const {
+ return static_cast<const BuildConfigNode*>(node_.get());
+ }
+
+ BuildConfigNode* operator->() {
+ return static_cast<BuildConfigNode*>(node_.get());
+ }
+
+ /*!
+ * \brief Push a new BuildConfig context onto the thread local stack.
+ * \param build_config The configuration to set as the current context.
+ */
+ EXPORT static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
+
+ /*!
+ * \brief Pop a build config off the thread local context stack, restoring the previous
+ * configuration as the current context.
+ */
+ EXPORT static void ExitBuildConfigScope();
+
+ /*!
+ * \brief Get the current BuildConfig context from thread local storage, or a default
+ * configuration if a BuildConfig scope has not been entered.
+ * \return The configuration that is the current context.
+ */
+ EXPORT static tvm::BuildConfig Current();
+
+ using ContainerType = BuildConfigNode;
+};
+
+/*!
+ * \brief RAII container to provide a scoped BuildConfig context. Pushes a configuration onto the
+ * context stack when constructed, and pops it when destructed.
+ */
+struct BuildConfigContext {
+ /*!
+ * \brief Enter a new BuildConfig context. The given BuildConfig becomes the new current
+ * context. When the BuildConfigContext is destructed, the previous context is restored.
+ * \param build_config The BuildConfig to set as the new current context.
+ */
+ explicit BuildConfigContext(const tvm::BuildConfig& build_config) {
+ BuildConfig::EnterBuildConfigScope(build_config);
+ }
+
+ /*! \brief Destructor. Pops the context off the thread local stack. */
+ ~BuildConfigContext() {
+ BuildConfig::ExitBuildConfigScope();
+ }
+};
+
+/*!
+* \brief Construct a BuildConfig containing a new BuildConfigNode
+* \return The new BuildConfig
+*/
+EXPORT BuildConfig build_config();
+
+/*!
+* \brief Build a LoweredFunc given a schedule, args and binds
+* \param sch The schedule to lower.
+* \param args The arguments to the function.
+* \param name The name of the lowered function.
+* \param binds Buffer assignments.
+* \param config The build configuration.
+* \return The lowered function.
+*/
+EXPORT Array<LoweredFunc> lower(Schedule sch,
+ const Array<Tensor>& args,
+ const std::string& name,
+ const std::unordered_map<Tensor, Buffer>& binds,
+ const BuildConfig& config);
+
+/*!
+* \brief Build a device and host module for a specific target from an array of lowered functions.
+* \param funcs The functions to be built.
+* \param target The target device to build for.
+* \param target_host The target for building host code. To use the default, pass Target()
+* \param config The build configuration.
+* \return The built module.
+*/
+EXPORT runtime::Module build(const Array<LoweredFunc>& funcs,
+ const Target& target,
+ const Target& target_host,
+ const BuildConfig& config);
+
+class GenericFuncNode;
+
+/*!
+ * \brief Generic function that can be specialized on a per-target basis.
+ */
+class GenericFunc : public NodeRef {
+ public:
+ GenericFunc() {}
+ explicit GenericFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+ /*!
+ * \brief Set the default function implementation.
+ * \param value The default function
+ * \param allow_override If true, this call may override a previously registered function. If
+ * false, an error will be logged if the call would override a previously registered function.
+ * \return reference to self.
+ */
+ TVM_DLL GenericFunc& set_default(const PackedFunc value,
+ bool allow_override = false);
+ /*!
+ * \brief Register a specialized function
+ * \param tags The tags for this specialization
+ * \param value The specialized function
+ * \param allow_override If true, this call may override previously registered tags. If false,
+ * an error will be logged if the call would override previously registered tags.
+ * \return reference to self.
+ */
+ TVM_DLL GenericFunc& register_func(const std::vector<std::string>& tags,
+ const PackedFunc value,
+ bool allow_override = false);
+ /*!
+ * \brief Call generic function by directly passing in unpacked format.
+ * \param args Arguments to be passed.
+ * \tparam Args arguments to be passed.
+ *
+ * \code
+ * // Example code on how to call generic function
+ * void CallGeneric(GenericFunc f) {
+ * // call like normal functions by pass in arguments
+ * // return value is automatically converted back
+ * int rvalue = f(1, 2.0);
+ * }
+ * \endcode
+ */
+ template<typename... Args>
+ inline TVMRetValue operator()(Args&& ...args) const;
+ /*!
+ * \brief Invoke the relevant function for the current target context, set by Target::EnterTargetScope or a TargetContext.
+ * Arguments are passed in packed format.
+ * \param args The arguments to pass to the function.
+ * \param ret The return value
+ */
+ TVM_DLL void CallPacked(TVMArgs args, TVMRetValue* ret) const;
+
+ /*!
+ * \brief Find or register the GenericFunc instance corresponding to the given name
+ * \param name The name of the registered GenericFunc
+ * \return The GenericFunc instance
+ */
+ TVM_DLL static GenericFunc Get(const std::string& name);
+
+ /*!
+ * \brief Add a GenericFunc instance to the registry
+ * \param func The GenericFunc instance
+ * \param name The name of the registered GenericFunc
+ */
+ TVM_DLL static void RegisterGenericFunc(GenericFunc func, const std::string& name);
+
+ /*!
+ * \brief access the internal node container
+ * \return the pointer to the internal node container
+ */
+ inline GenericFuncNode* operator->();
+
+ // declare container type
+ using ContainerType = GenericFuncNode;
+
+ // Internal class.
+ struct Manager;
+
+ private:
+ friend struct Manager;
+};
+
+template<typename... Args>
+inline TVMRetValue GenericFunc::operator()(Args&& ...args) const {
+ const int kNumArgs = sizeof...(Args);
+ const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
+ TVMValue values[kArraySize];
+ int type_codes[kArraySize];
+ detail::for_each(TVMArgsSetter(values, type_codes),
+ std::forward<Args>(args)...);
+ TVMRetValue rv;
+ CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv);
+ return rv;
+}
+
+/*!
+ * \brief Represents a generic function that can be specialized on a per-target basis.
+ */
+class GenericFuncNode : public Node {
+ public:
+ /*! \brief name of the function */
+ std::string name_;
+ /*! \brief the generic builder */
+ PackedFunc generic_func_;
+ /*! \brief map from keys to registered functions */
+ std::unordered_map<std::string, PackedFunc> dispatch_dict_;
+
+ static constexpr const char* _type_key = "GenericFunc";
+ TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node);
+};
+
+inline GenericFuncNode* GenericFunc::operator->() {
+ return static_cast<GenericFuncNode*>(node_.get());
+}
+
+#define TVM_GENERIC_FUNC_REG_VAR_DEF \
+ static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_ ## TVM
+
+/*!
+ * \def TVM_REGISTER_GENERIC_FUNC
+ * \brief Register a new generic function, or set a device-specific variant
+ * of the corresponding function.
+ *
+ * \param name The name of the function
+ */
+#define TVM_REGISTER_GENERIC_FUNC(name) \
+ TVM_STR_CONCAT(TVM_GENERIC_FUNC_REG_VAR_DEF, __COUNTER__) = \
+ ::tvm::GenericFunc::Get(#name)
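+
+// Example usage (sketch; the generic function name and packed functions are
+// illustrative):
+//
+//   TVM_REGISTER_GENERIC_FUNC(my_schedule)
+//     .set_default(my_default_packed_func)
+//     .register_func({"cuda", "gpu"}, my_cuda_packed_func);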
+
+
+} // namespace tvm
+
+#endif // TVM_BUILD_MODULE_H_
diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h
index f81018a7610e..6f15ef9a3e80 100644
--- a/include/tvm/c_dsl_api.h
+++ b/include/tvm/c_dsl_api.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file c_dsl_api.h
+ * \file tvm/c_dsl_api.h
*
* \brief TVM DSL Node C API, used to interact to DSL compilation.
*
@@ -17,7 +17,7 @@
#include "./runtime/c_runtime_api.h"
#ifdef __cplusplus
-TVM_EXTERN_C {
+extern "C" {
#endif
/*! \brief handle to node */
diff --git a/include/tvm/channel.h b/include/tvm/channel.h
index 56adff4ad8df..28d9b5f7ce4a 100644
--- a/include/tvm/channel.h
+++ b/include/tvm/channel.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file channel.h
+ * \file tvm/channel.h
* \brief Channel object for pipeline.
*/
#ifndef TVM_CHANNEL_H_
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index c43227bb1164..6b5116a143cc 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file codegen.h
+ * \file tvm/codegen.h
* \brief Collection of Lowlevel IR pass to codegen.
*/
#ifndef TVM_CODEGEN_H_
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 4e4e25c0ce7d..8c789f8df1dc 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file expr.h
+ * \file tvm/expr.h
* \brief The Expr and related elements in DataFlow construction.
*/
#ifndef TVM_EXPR_H_
@@ -16,31 +16,31 @@
namespace tvm {
-using Halide::Type;
-using Halide::Float;
-using Halide::Bool;
-using Halide::Int;
-using Halide::UInt;
-using Halide::Handle;
-using Halide::ExprHash;
-using Halide::ExprEqual;
+using HalideIR::Type;
+using HalideIR::Float;
+using HalideIR::Bool;
+using HalideIR::Int;
+using HalideIR::UInt;
+using HalideIR::Handle;
+using HalideIR::ExprHash;
+using HalideIR::ExprEqual;
-using Halide::Expr;
-using Halide::VarExpr;
-using Halide::IR::RangeNode;
-using Halide::IR::FunctionRef;
-using Halide::IR::FunctionBaseNode;
-using Halide::Internal::Stmt;
-using Halide::Internal::IRPrinter;
-using Halide::Internal::Variable;
+using HalideIR::Expr;
+using HalideIR::VarExpr;
+using HalideIR::IR::RangeNode;
+using HalideIR::IR::FunctionRef;
+using HalideIR::IR::FunctionBaseNode;
+using HalideIR::Internal::Stmt;
+using HalideIR::Internal::IRPrinter;
+using HalideIR::Internal::Variable;
-using Halide::Internal::make_const;
-using Halide::Internal::make_zero;
-using Halide::Internal::as_const_int;
-using Halide::Internal::as_const_uint;
-using Halide::Internal::const_true;
-using Halide::Internal::const_false;
-using Halide::Internal::is_no_op;
+using HalideIR::Internal::make_const;
+using HalideIR::Internal::make_zero;
+using HalideIR::Internal::as_const_int;
+using HalideIR::Internal::as_const_uint;
+using HalideIR::Internal::const_true;
+using HalideIR::Internal::const_false;
+using HalideIR::Internal::is_no_op;
inline Type TVMShapeIndexType() {
if (std::is_signed<tvm_index_t>::value) {
@@ -51,7 +51,7 @@ inline Type TVMShapeIndexType() {
}
inline Type TVMType2Type(TVMType t) {
- return Type(static_cast<halide_type_code_t>(t.code), t.bits, t.lanes);
+ return Type(static_cast<halideir_type_code_t>(t.code), t.bits, t.lanes);
}
inline TVMType Type2TVMType(Type t) {
@@ -71,9 +71,9 @@ inline int GetVectorBytes(Type dtype) {
}
/*! \brief a named variable in TVM */
-class Var : public Halide::VarExpr {
+class Var : public HalideIR::VarExpr {
public:
- explicit Var(const std::string& name_hint = "v",
+ EXPORT explicit Var(const std::string& name_hint = "v",
Type t = Int(32)) : VarExpr(name_hint, t) {}
explicit Var(std::shared_ptr<Node> n) : VarExpr(n) {}
explicit Var(VarExpr v) : VarExpr(v) {}
@@ -94,7 +94,7 @@ class Var : public Halide::VarExpr {
class IterVarNode;
/*!
- * \brief same as Halide::IR::Range
+ * \brief same as HalideIR::IR::Range
* except it provide an constructor with (begin, end)
*
* \note Traditional Halide's Range have a constructor with
@@ -102,11 +102,11 @@ class IterVarNode;
* We decided to correct it by removing the constructor in HalideIR,
* and add it back in TVM's range.
*/
-class Range : public Halide::IR::Range {
+class Range : public HalideIR::IR::Range {
public:
/*! \brief constructor */
Range() {}
- explicit Range(std::shared_ptr<Node> n) : Halide::IR::Range(n) {}
+ explicit Range(std::shared_ptr<Node> n) : HalideIR::IR::Range(n) {}
/*!
* \brief constructor by begin and end
* \param begin The begin of the range.
@@ -291,6 +291,13 @@ inline const char* IterVarType2String(IterVarType t) {
return "Unknown";
}
+/*!
+ * \brief Construct a new Var expression
+ * \param name_hint The name hint for the expression
+ * \param t The type of the expression
+ */
+TVM_DLL Var var(const std::string& name_hint, Type t = Int(32));
+
/*
* \brief Template function to convert Map to unordered_map
* Sometimes useful for API gluing when internal uses unordered_map
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index ae53d38b82b2..9ea16131188d 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir.h
+ * \file tvm/ir.h
* \brief Additional high level nodes in the IR
*/
#ifndef TVM_IR_H_
@@ -16,11 +16,11 @@
namespace tvm {
namespace ir {
-using Halide::Internal::ExprNode;
-using Halide::Internal::StmtNode;
-using Halide::Internal::IRNodeType;
-using Halide::Internal::ForType;
-using Halide::DeviceAPI;
+using HalideIR::Internal::ExprNode;
+using HalideIR::Internal::StmtNode;
+using HalideIR::Internal::IRNodeType;
+using HalideIR::Internal::ForType;
+using HalideIR::DeviceAPI;
// Node container for CommReducer
struct CommReducerNode;
@@ -152,6 +152,12 @@ constexpr const char* coproc_scope = "coproc_scope";
constexpr const char* coproc_uop_scope = "coproc_uop_scope";
/*! \brief Mark the scope as volatile access for certain handle. */
constexpr const char* volatile_scope = "volatile_scope";
+/*!
+ * \brief Mark the scope as generated by an extern primitive.
+ *  Such a scope can contain an arbitrary IR program, so we need to be careful
+ *  when making assumptions about its structure.
+ */
+constexpr const char* extern_scope = "extern_scope";
/*!
* \brief Mark the scope as when computation start to happen
* This can hint some code generator to create a new function for compute.
@@ -171,8 +177,10 @@ constexpr const char* device_context_type = "device_context_type";
constexpr const char* loop_scope = "loop_scope";
/*! \brief Mark of reduce scope */
constexpr const char* reduce_scope = "reduce_scope";
-/*! \brief Mark region is guarded by the pragma */
-constexpr const char* pragma_scope = "pragma_scope";
+/*! \brief Mark region is guarded by the pragma extension */
+constexpr const char* pragma_scope_prefix = "pragma_";
+/*! \brief Import llvm source or file into the final code gen module */
+constexpr const char* pragma_import_llvm = "pragma_import_llvm";
/*!
* \brief Mark of prefetch scope, value=offset,
* run prefetch of Tensor on the current loop scope
@@ -220,6 +228,23 @@ constexpr const char* channel_write_advance = "channel_write_advance";
constexpr const char* pipeline_stage_scope = "pipeline_stage_scope";
/*! \brief pipeline execution scope, implies the scope can be pipelined. */
constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
+/*!
+ * \brief Mark that this stage is an OpenGL shader. Since OpenGL shader only
+ * allows writing out to one element of the output texture, the Provide node
+ * gets translated to a special Call::glsl_texture_store statement instead of a
+ * Store statement.
+ */
+constexpr const char* opengl_stage_scope = "opengl_stage_scope";
+
+/*!
+ * \brief Check if attr_key is a pragma key extension
+ * \param attr_key The attr key to be compared
+ * \return true if it is a pragma key
+ */
+inline bool IsPragmaKey(const std::string& attr_key) {
+ return attr_key.compare(0, 7, "pragma_") == 0;
+}
+
} // namespace attr
/*! \brief namespace of TVM Intrinsic functions */
@@ -256,6 +281,11 @@ constexpr const char* tvm_if_then_else = "tvm_if_then_else";
* }
*/
constexpr const char* tvm_access_ptr = "tvm_access_ptr";
+/*!
+ * \brief Create a function-local static handle that initializes to nullptr.
+ *  It can be used to cache function-local static resources.
+ */
+constexpr const char* tvm_static_handle = "tvm_static_handle";
/*!
* \brief Return a unique context id, used for hint of workspace separation.
* Different context ids guarantee non-overlapping workspaces.
@@ -393,6 +423,14 @@ constexpr const char* tvm_call_packed_lowered = "tvm_call_packed_lowered";
* }
*/
constexpr const char* tvm_storage_sync = "tvm_storage_sync";
+/*!
+ * \brief See pseudo code
+ *
+ * Type tvm_warp_shuffle(Type value, warp_id) {
+ * return (value passed in by warp indicated by warp_id);
+ * }
+ */
+constexpr const char* tvm_warp_shuffle = "tvm_warp_shuffle";
/*!
* \brief Initialize the global barrier.
* Call this at beginning of kernel that need global barrier.
@@ -433,51 +471,61 @@ enum TVMStructFieldKind : int {
} // namespace intrinsic
// Reuse IR node definitions from HalideIR
-using Halide::Internal::IntImm;
-using Halide::Internal::UIntImm;
-using Halide::Internal::FloatImm;
-using Halide::Internal::StringImm;
-using Halide::Internal::Cast;
-using Halide::Internal::Add;
-using Halide::Internal::Sub;
-using Halide::Internal::Mul;
-using Halide::Internal::Div;
-using Halide::Internal::Mod;
-using Halide::Internal::Min;
-using Halide::Internal::Max;
-using Halide::Internal::EQ;
-using Halide::Internal::NE;
-using Halide::Internal::LT;
-using Halide::Internal::LE;
-using Halide::Internal::GT;
-using Halide::Internal::GE;
-using Halide::Internal::And;
-using Halide::Internal::Or;
-using Halide::Internal::Not;
-using Halide::Internal::Select;
-using Halide::Internal::Load;
-using Halide::Internal::Ramp;
-using Halide::Internal::Broadcast;
-using Halide::Internal::Call;
-using Halide::Internal::Let;
-using Halide::Internal::LetStmt;
-using Halide::Internal::AttrStmt;
-using Halide::Internal::AssertStmt;
-using Halide::Internal::ProducerConsumer;
-using Halide::Internal::For;
-using Halide::Internal::Store;
-using Halide::Internal::Provide;
-using Halide::Internal::Allocate;
-using Halide::Internal::Free;
-using Halide::Internal::Realize;
-using Halide::Internal::Prefetch;
-using Halide::Internal::Block;
-using Halide::Internal::IfThenElse;
-using Halide::Internal::Evaluate;
-using Halide::Internal::Shuffle;
+using HalideIR::Internal::IntImm;
+using HalideIR::Internal::UIntImm;
+using HalideIR::Internal::FloatImm;
+using HalideIR::Internal::StringImm;
+using HalideIR::Internal::Cast;
+using HalideIR::Internal::Add;
+using HalideIR::Internal::Sub;
+using HalideIR::Internal::Mul;
+using HalideIR::Internal::Div;
+using HalideIR::Internal::Mod;
+using HalideIR::Internal::Min;
+using HalideIR::Internal::Max;
+using HalideIR::Internal::EQ;
+using HalideIR::Internal::NE;
+using HalideIR::Internal::LT;
+using HalideIR::Internal::LE;
+using HalideIR::Internal::GT;
+using HalideIR::Internal::GE;
+using HalideIR::Internal::And;
+using HalideIR::Internal::Or;
+using HalideIR::Internal::Not;
+using HalideIR::Internal::Select;
+using HalideIR::Internal::Load;
+using HalideIR::Internal::Ramp;
+using HalideIR::Internal::Broadcast;
+using HalideIR::Internal::Call;
+using HalideIR::Internal::Let;
+using HalideIR::Internal::LetStmt;
+using HalideIR::Internal::AttrStmt;
+using HalideIR::Internal::AssertStmt;
+using HalideIR::Internal::ProducerConsumer;
+using HalideIR::Internal::For;
+using HalideIR::Internal::Store;
+using HalideIR::Internal::Provide;
+using HalideIR::Internal::Allocate;
+using HalideIR::Internal::Free;
+using HalideIR::Internal::Realize;
+using HalideIR::Internal::Prefetch;
+using HalideIR::Internal::Block;
+using HalideIR::Internal::IfThenElse;
+using HalideIR::Internal::Evaluate;
+using HalideIR::Internal::Shuffle;
// ir functions
-using Halide::Internal::is_const_power_of_two_integer;
+using HalideIR::Internal::is_const_power_of_two_integer;
+/*!
+ * \brief Create a type annotation expression
+ * \param dtype The data type
+ * \return Expr a expression with dtype.
+ */
+inline Expr TypeAnnotation(Type dtype) {
+ return ir::Call::make(dtype,
+ "type_annotation", {},
+ ir::Call::PureIntrinsic);
+}
} // namespace ir
} // namespace tvm
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 55368fbea14d..3784608c8da1 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file ir_functor_ext.h
+ * \file tvm/ir_functor_ext.h
* \brief More powerful Visitor that allows define function signatures.
*/
#ifndef TVM_IR_FUNCTOR_EXT_H_
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index 1faf1724ddb6..b8aae3638149 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_mutator.h
+ * \file tvm/ir_mutator.h
* \brief Defines general IRMutation pass
*/
#ifndef TVM_IR_MUTATOR_H_
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index a0726f0030ab..947c3b736d80 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file ir_operator.h
+ * \file tvm/ir_operator.h
* \brief Common operators of Expr
*/
#ifndef TVM_IR_OPERATOR_H_
@@ -12,14 +12,13 @@
namespace tvm {
-using Halide::likely;
-using Halide::likely_if_innermost;
+using HalideIR::likely;
+using HalideIR::likely_if_innermost;
// functions
-using Halide::cast;
-using Halide::min;
-using Halide::max;
-using Halide::abs;
-using Halide::select;
+using HalideIR::cast;
+using HalideIR::min;
+using HalideIR::max;
+using HalideIR::select;
/*!
* \brief sum of source expression over axis
@@ -42,16 +41,55 @@ TVM_DLL Expr max(Expr source, Array axis);
*/
TVM_DLL Expr min(Expr source, Array axis);
+
// Unary intrinsic operators
#define TVM_DECLARE_INTRIN_UNARY(OpName) \
inline Expr OpName(Expr x) { \
- return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureExtern); \
+ return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \
} \
+
TVM_DECLARE_INTRIN_UNARY(exp);
TVM_DECLARE_INTRIN_UNARY(tanh);
TVM_DECLARE_INTRIN_UNARY(sigmoid);
TVM_DECLARE_INTRIN_UNARY(sqrt);
+TVM_DECLARE_INTRIN_UNARY(log);
+TVM_DECLARE_INTRIN_UNARY(floor);
+TVM_DECLARE_INTRIN_UNARY(ceil);
+TVM_DECLARE_INTRIN_UNARY(round);
+TVM_DECLARE_INTRIN_UNARY(trunc);
+
+/*!
+ * \brief Calculate power(x, y)
+ * \param x The left operand.
+ * \param y The right operand.
+ */
+inline Expr pow(Expr x, Expr y) {
+ match_types(x, y);
+ CHECK(x.type().is_float()) << "power only applies to float";
+ return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+}
+
+/*!
+ * \brief Calculate absolute value of x, elementwise
+ * \param x The input data
+ *
+ * \return The absolute value of input data x
+ */
+inline Expr abs(Expr x) {
+ if (x.type().is_int()) {
+ return select(x >= make_zero(x.type()), x, -x);
+ } else if (x.type().is_float()) {
+ return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
+ } else if (x.type().is_uint()) {
+ return x;
+ } else {
+ LOG(WARNING) << "Data type " << x.type()
+ << " not supported for absolute op. Skipping absolute op...";
+ return x;
+ }
+}
+
} // namespace tvm
#endif // TVM_IR_OPERATOR_H_
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 6b95bd268652..d875621a3f5e 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_pass.h
+ * \file tvm/ir_pass.h
* \brief Collection of IR pass functions
*
* When the pass functions in this file are for Stmt,
@@ -22,27 +22,39 @@
namespace tvm {
namespace ir {
-inline Expr Simplify(Expr a) {
- return Halide::Internal::simplify(a);
-}
+/*!
+ * \brief Simplify the expression.
+ * \param expr The expression to be simplified.
+ * \param vrange The range information about the variable.
+ * \return Simplified expression.
+ */
+EXPORT Expr Simplify(Expr expr, Map<Var, Range> vrange = Map<Var, Range>());
-inline Stmt Simplify(Stmt a) {
- return Halide::Internal::simplify(a);
-}
+/*!
+ * \brief Simplify the statement.
+ * \param stmt The statement to be simplified.
+ * \param vrange The range information about the variable.
+ * \return Simplified statement.
+ */
+Stmt Simplify(Stmt stmt, Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Simplify by applying canonical form.
* \param stmt The statement to be canonically simplified.
+ * \param vrange The range information about the variable.
* \return Canonicalized statement.
*/
-Stmt CanonicalSimplify(Stmt stmt);
+Stmt CanonicalSimplify(Stmt stmt,
+ Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Simplify by applying canonical form.
* \param expr The expression to be canonically simplified.
+ * \param vrange The range information about the variable.
* \return Canonicalized expression.
*/
-Expr CanonicalSimplify(Expr expr);
+EXPORT Expr CanonicalSimplify(Expr expr,
+ Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Deep compare lhs and rhs
@@ -50,7 +62,7 @@ Expr CanonicalSimplify(Expr expr);
* \param rhs The right operand
* \return The comparison result.
*/
-bool Equal(const Expr& lhs, const Expr& rhs);
+EXPORT bool Equal(const Expr& lhs, const Expr& rhs);
/*!
* \brief Deep compare lhs and rhs
@@ -204,11 +216,17 @@ Stmt NarrowChannelAccess(Stmt stmt);
*
* \param stmt The statment to be unrolled.
* \param auto_max_step The maximum step before stop attach automatic unroll
- * \param auto_min_depth The minimum depth before we can start automatic unroll
+ * \param auto_max_depth The maximum depth before stop attach automatic unroll
+ * \param auto_max_extent The maximum extent of the loop we can unroll;
+ * this is a legacy option that does not take the loop's total steps into account.
* \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen.
* \return Transformed stmt.
*/
-Stmt UnrollLoop(Stmt stmt, int auto_max_step, int auto_min_depth, bool explicit_unroll);
+Stmt UnrollLoop(Stmt stmt,
+ int auto_max_step,
+ int auto_max_depth,
+ int auto_max_extent,
+ bool explicit_unroll);
/*!
* \brief vectorize the constant loops
@@ -271,9 +289,10 @@ Stmt StorageRewrite(Stmt stmt);
/*!
* \brief partition loops in the stmt
* \param stmt The stmt to do loop partition
+ * \param split_const_loop flag to enable partition for const loop
* \return Transformed stmt.
*/
-Stmt LoopPartition(Stmt stmt);
+Stmt LoopPartition(Stmt stmt, bool split_const_loop);
/*!
* \brief Detect and insert sync points to co-processor.
@@ -388,6 +407,29 @@ LoweredFunc ThreadSync(LoweredFunc stmt, std::string storage_scope);
*/
LoweredFunc LowerThreadAllreduce(LoweredFunc f, int warp_size);
+/*!
+ * \brief Lower warp memory in stmt.
+ * \param f The device function to be lowered.
+ * \param warp_size the size of a warp, within which no sync is needed.
+ * This function only takes effect if warp_size is greater than one.
+ * \return Transformed function.
+ */
+LoweredFunc LowerWarpMemory(LoweredFunc f, int warp_size);
+
+/*!
+ * \brief Remap the thread axis
+ *
+ * This can be used to get equivalent program which uses
+ * threadIdx.y in place of threadIdx.x by passing
+ * {"threadIdx.x": thread_axis("threadIdx.y")}
+ *
+ *
+ * \param f The device function to be lowered.
+ * \param axis_map The map from StringImm -> IterVar
+ * \return Transformed function.
+ */
+LoweredFunc RemapThreadAxis(LoweredFunc f, Map<Expr, IterVar> axis_map);
+
/*!
* \brief Lower packed function call.
* \param f The function to be lowered.
@@ -402,6 +444,18 @@ LoweredFunc LowerTVMBuiltin(LoweredFunc f);
*/
LoweredFunc CombineContextCall(LoweredFunc f);
+/*!
+ * \brief Rewrite the pointer content type of arguments,
+ * as well as Allocs internal to the function, to use
+ * the most frequently accessed type for load/store,
+ * so as to avoid pointer casting in the backend when possible.
+ *
+ * \note Implemented in storage_rewrite.cc.
+ * \param f The function to be transformed.
+ * \return Transformed function.
+ */
+LoweredFunc PointerValueTypeRewrite(LoweredFunc f);
+
/*!
* \brief Lower intrinsic function calls.
* \param f The device function to be lowered.
@@ -409,6 +463,44 @@ LoweredFunc CombineContextCall(LoweredFunc f);
* \return Transformed function.
*/
LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target);
+
+/*!
+ * \brief Verify if memory accesses are legal for a specific target device type.
+ *
+ * In the case that the target is CUDA, if not all of the workload is
+ * bound to threads, CPU code that tries to access GPU memory is
+ * generated, which is illegal. This pass performs verification for this case.
+ *
+ * \param func The function to be verified.
+ * \param device_type The target device type.
+ * \return Success of memory verification.
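+ *
+ * A minimal sketch (assumes func is intended for a GPU target; kDLGPU is
+ * the dlpack device type code for GPUs):
+ *
+ * \code
+ * bool ok = VerifyMemory(func, kDLGPU);
+ * \endcode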
+ */
+bool VerifyMemory(LoweredFunc func, int device_type);
+
+
+/*!
+ * \brief Verify the correctness of GPU code.
+ * It checks whether the amount of memory used or the number of threads
+ * in a block exceeds the limits.
+ * \param stmt The statement to be checked
+ * \param constraints The dict to specify constraints to check.
+ * Possible keys are
+ *
+ * "max_local_memory_per_block": Total amount of local memory per block (in bytes).
+ * "max_shared_memory_per_block": Total amount of shared memory per block (in bytes).
+ * "max_threads_per_block": Maximum number of threads per block.
+ * "max_thread_x": Maximum length of threadIdx.x.
+ * "max_thread_y": Maximum length of threadIdx.y.
+ * "max_thread_z": Maximum length of threadIdx.z.
+ *
+ * If one key is missing in this argument, the pass won't check for that item.
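+ *
+ * A minimal sketch of building the constraint dict (the key/value choices
+ * are illustrative only):
+ *
+ * \code
+ * Map<std::string, Expr> constraints;
+ * constraints.Set("max_threads_per_block", 1024);
+ * constraints.Set("max_shared_memory_per_block", 49152);
+ * bool valid = VerifyGPUCode(stmt, constraints);
+ * \endcode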
+ * \return Whether it is valid GPU code.
+ */
+bool VerifyGPUCode(Stmt stmt,
+ Map<std::string, Expr> constraints);
+
+
} // namespace ir
} // namespace tvm
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 7cfd45b833c8..8919b0f7a5c2 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_visitor.h
+ * \file tvm/ir_visitor.h
* \brief Visitor to quickly visit IR trees
*/
#ifndef TVM_IR_VISITOR_H_
diff --git a/include/tvm/logging.h b/include/tvm/logging.h
new file mode 100644
index 000000000000..070b6e092a2e
--- /dev/null
+++ b/include/tvm/logging.h
@@ -0,0 +1,99 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file tvm/logging.h
+ * \brief logging utilities on top of dmlc-core
+ */
+#ifndef TVM_LOGGING_H_
+#define TVM_LOGGING_H_
+
+// A technique that enables overloading a macro based on its number of parameters.
+// It is used to define the other macros below.
+#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
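+//
+// For illustration only (FOO and FOO_N are hypothetical, not part of this
+// header): a macro defined as
+//   #define FOO(...) GET_MACRO(__VA_ARGS__, FOO_5, FOO_4, FOO_3, FOO_2, FOO_1)(__VA_ARGS__)
+// dispatches FOO(a, b) to FOO_2(a, b) and FOO(a, b, c) to FOO_3(a, b, c).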
+
+/*!
+ * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X
+ * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined in dmlc-core/include/dmlc/logging.h).
+ * COND_X (but not COND_X_N) is supposed to be used outside this file.
+ * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
+ * is a boolean. The rest of the parameters of COND_X are the same as the parameters of X.
+ * quit_on_assert determines the overall behaviour of COND_X. If it's true, COND_X
+ * quits the program on assertion failure. If it's false, it moves on and reports
+ * the assertion failure back to the macro caller in an appropriate manner (e.g., 'return false'
+ * in a function, or 'continue' or 'break' in a loop).
+ * The default behavior when quit_on_assert is false is to 'return false'. If this is not
+ * desirable, the macro caller can pass one last extra parameter to COND_X to tell COND_X what
+ * to do when quit_on_assert is false and the assertion fails.
+ *
+ * Rationale: These macros were designed to concisely implement functions that have
+ * two behaviours: quitting on assertion failures, or trying to move on from them.
+ * Note that these macros hide a lot of control flow, and therefore make the logic
+ * of the surrounding code slightly harder to follow. However, in pieces of code that
+ * use these macros frequently, they significantly shorten the amount of code to be
+ * read, and keep the main logic of the function free of repetitive control-flow
+ * structure. The readability cost also diminishes as developers get used to the macros.
+ *
+ * Here is an example of how to use it
+ * \code
+ * bool f(..., bool quit_on_assertion) {
+ * int a = 0, b = 0;
+ * ...
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default behaviour)
+ * COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quitting"
+ * ...
+ * for (int i = 0; i < N; i++) {
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
+ * // behaviour, therefore, has to be explicitly specified)
+ * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quitting"
+ * }
+ * }
+ * \endcode
+ */
+#define COND_CHECK_GE(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
+#define COND_CHECK_EQ(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
+#define COND_CHECK(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
+#define COND_LOG(...) \
+ GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
+
+// Not supposed to be used by users directly.
+#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
+ if (!quit_on_assert) { \
+ if (!((x) op (y))) \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ CHECK_##op(x, y)
+
+#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
+#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
+
+#define COND_CHECK_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ if (!(x)) \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ CHECK(x)
+
+#define COND_LOG_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ LOG(x)
+
+#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
+#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
+
+#endif // TVM_LOGGING_H_
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 7b7ebcf1e4d5..19f7e27f1c75 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file lowered_func.h
+ * \file tvm/lowered_func.h
* \brief Information about a lowered TVM function.
* This data structure is final step toward codegen.
*/
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index 8242bfbeefb4..d13680531af9 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file operation.h
+ * \file tvm/operation.h
* \brief Operation node can generate one or multiple Tensors
*/
#ifndef TVM_OPERATION_H_
@@ -41,6 +41,8 @@ class OperationNode : public FunctionBaseNode {
std::string name;
/*! \brief optional tag of the operation */
std::string tag;
+ /*! \brief additional attributes of the operation */
+ Map<std::string, NodeRef> attrs;
/*! \return name of the operation */
const std::string& func_name() const final {
return name;
@@ -117,11 +119,13 @@ class OperationNode : public FunctionBaseNode {
 * \brief Build the statement that provides the output tensors.
* \param stage The schedule stage of the op.
* \param dom_map The domain map of all iteration domains.
+ * \param debug_keep_trivial_loop Whether to keep trivial loops with extent of 1.
 * \return A statement that adds production and wraps the consumer.
*/
virtual Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const = 0;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const = 0;
static constexpr const char* _type_key = "Operation";
@@ -160,10 +164,13 @@ class PlaceholderOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
+ v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("shape", &shape);
v->Visit("dtype", &dtype);
}
@@ -178,7 +185,7 @@ class PlaceholderOpNode : public OperationNode {
/*!
* \brief A Compute op that compute a tensor on certain domain.
*/
-class ComputeOpNode : public OperationNode {
+class TVM_DLL ComputeOpNode : public OperationNode {
public:
/*! \brief IterVar on each axis */
 Array<IterVar> axis;
@@ -211,17 +218,20 @@ class ComputeOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("axis", &axis);
v->Visit("reduce_axis", &reduce_axis);
v->Visit("body", &body);
}
 static Operation make(std::string name,
 std::string tag,
+ Map<std::string, NodeRef> attrs,
 Array<IterVar> axis,
 Array<Expr> body);
@@ -282,11 +292,13 @@ class ScanOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("scan_axis", &scan_axis);
v->Visit("init", &init);
v->Visit("update", &update);
@@ -296,6 +308,7 @@ class ScanOpNode : public OperationNode {
}
 static Operation make(std::string name,
 std::string tag,
+ Map<std::string, NodeRef> attrs,
 IterVar axis,
 Array<Tensor> init,
 Array<Tensor> update,
@@ -345,20 +358,23 @@ class ExternOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("inputs", &inputs);
v->Visit("body", &body);
}
- static Operation make(std::string name,
- std::string tag,
- Array<Tensor> inputs,
- Array<Buffer> input_placeholders,
- Array<Buffer> output_placeholders,
- Stmt body);
+ EXPORT static Operation make(std::string name,
+ std::string tag,
+ Map<std::string, NodeRef> attrs,
+ Array<Tensor> inputs,
+ Array<Buffer> input_placeholders,
+ Array<Buffer> output_placeholders,
+ Stmt body);
static constexpr const char* _type_key = "ExternOp";
TVM_DECLARE_NODE_TYPE_INFO(ExternOpNode, OperationNode);
@@ -387,11 +403,13 @@ TVM_DLL Tensor placeholder(Array<Expr> shape,
* \param fcompute The compute function to create the tensor.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
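+ *
+ * A minimal sketch of attaching attrs (assumes an existing Tensor A; the
+ * attribute key and value are illustrative):
+ *
+ * \code
+ * Map<std::string, NodeRef> attrs;
+ * attrs.Set("layout", ir::StringImm::make("NCHW"));
+ * Tensor B = compute(A->shape,
+ *                    [&](const Array<Var>& i) { return A(i) + 1; },
+ *                    "B", "elemwise", attrs);
+ * \endcode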
*/
TVM_DLL Tensor compute(Array<Expr> shape,
 FCompute fcompute,
 std::string name = "tensor",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
/*!
* \brief Construct a new tensor by computing over shape,
@@ -400,11 +418,13 @@ TVM_DLL Tensor compute(Array<Expr> shape,
* \param fcompute The compute function to create the tensors.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
*/
TVM_DLL Array<Tensor> compute(Array<Expr> shape,
 FBatchCompute fcompute,
 std::string name = "tensor",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
/*!
* \brief Construct new tensors by scan.
@@ -416,42 +436,48 @@ TVM_DLL Array<Tensor> compute(Array<Expr> shape,
* but recommended to provide concrete information about scan body.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
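+ *
+ * A sketch of invoking scan (assumes s_init, s_update, s_state, and input X
+ * are set up as in the TVM scan tutorial):
+ *
+ * \code
+ * Array<Tensor> out = scan({s_init}, {s_update}, {s_state}, {X});
+ * \endcode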
*/
TVM_DLL Array<Tensor> scan(Array<Tensor> init,
 Array<Tensor> update,
 Array<Tensor> state_placeholder,
 Array<Tensor> inputs = Array<Tensor>(),
 std::string name = "scan",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
// same as compute, specialized for different fcompute function
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1], i[2]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var, Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1], i[2], i[3]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
// inline function.
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 1f66232baacc..95964547ef8e 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file packed_func_ext.h
+ * \file tvm/packed_func_ext.h
* \brief Extension package to PackedFunc
 * This enables passing NodeRef types into/from PackedFunc.
*/
@@ -14,6 +14,7 @@
#include "./base.h"
#include "./expr.h"
+#include "./tensor.h"
#include "./runtime/packed_func.h"
namespace tvm {
@@ -59,6 +60,25 @@ struct NodeTypeChecker<Array<T> > {
}
};
+template<typename K, typename V>
+struct NodeTypeChecker<Map<K, V> > {