diff --git a/CODEOWNERS b/.github/CODEOWNERS
similarity index 51%
rename from CODEOWNERS
rename to .github/CODEOWNERS
index 0e22cff91e0b..5d0e94533bf4 100644
--- a/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,13 +5,23 @@
* @dmlc/tvm-committers
# LLVM backends
-src/llvm/* @aatluri
+src/codegen/llvm/* @aatluri
# ROCM runtime
src/runtime/rocm/* @aatluri
+# SGX support
+src/runtime/sgx/* @nhynes
+apps/sgx/* @nhynes
+
# JVM language
-jvm/* @javelinjs
+jvm/* @yzhliu
+
+# WebGL backends
+src/runtime/opengl/* @phisiart
+src/codegen/*opengl* @phisiart
# TOPI
topi/python/topi/* @Laurawly @Huyuwei
+
+
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 000000000000..0e2a130d489e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,7 @@
+Thanks for participating in the TVM community! We use https://discuss.tvm.ai for general usage questions and discussions. The issue tracker is used for actionable items such as feature proposal discussions, roadmaps, and bug tracking. You are always welcome to post on the forum first :)
+
+Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that fall to the bottom of the pile. Feel free to open a new issue if an additional problem needs attention after an old one gets closed.
+
+For bug reports, to help developers act on the issue, please include a description of your environment and, preferably, a minimal script to reproduce the problem.
+
+For feature proposals, list clear, small actionable items so we can track the progress of the change.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 000000000000..313b776b0824
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1 @@
+Thanks for contributing to TVM! Please refer to the guidelines at https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from others in the community.
diff --git a/.gitignore b/.gitignore
index f59a58552f8d..3c968eb3ed47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -98,7 +98,6 @@ build_*
Win32
*.dir
perf
-nnvm
*.wasm
.emscripten
@@ -132,13 +131,63 @@ xcuserdata/
.emscripten*
.m2
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
## Other
*.moved-aside
*.xccheckout
*.xcscmblueprint
.DS_Store
tags
+cscope*
+*.lock
# vim temporary files
*.swp
*.swo
+
+# TVM generated code
+perf
+.bash_history
+*.json
+*.params
+*.onnx
+*.h5
+synset.txt
+cat.jpg
+docs.tgz
+cat.png
+*.mlmodel
+# Mac OS X
+.DS_Store
+build*
+
+# Jetbrain
+.idea
+
+# tmp file
+.nfs*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f42705ae7fda..39776d53d1f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,12 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.2)
project(tvm C CXX)
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
- include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
-endif()
-
-include(cmake/Util.cmake)
+# Utility functions
+include(cmake/util/Util.cmake)
+include(cmake/util/FindCUDA.cmake)
+include(cmake/util/FindVulkan.cmake)
+include(cmake/util/FindLLVM.cmake)
+include(cmake/util/FindROCM.cmake)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
@@ -19,33 +20,47 @@ endif()
# You can create a config.cmake at build folder
# and add set(OPTION VALUE) to override these build options.
 # Alternatively, use cmake -DOPTION=VALUE through the command line.
-
tvm_option(USE_CUDA "Build with CUDA" OFF)
tvm_option(USE_OPENCL "Build with OpenCL" OFF)
+tvm_option(USE_VULKAN "Build with Vulkan" OFF)
+tvm_option(USE_OPENGL "Build with OpenGL" OFF)
tvm_option(USE_METAL "Build with Metal" OFF)
+tvm_option(USE_ROCM "Build with ROCM" OFF)
+tvm_option(ROCM_PATH "The path to rocm" /opt/rocm)
tvm_option(USE_RPC "Build with RPC" ON)
+tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
-tvm_option(USE_LLVM "Build with LLVM" OFF)
+tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
tvm_option(USE_RTTI "Build with RTTI" ON)
tvm_option(USE_MSVC_MT "Build with MT" OFF)
tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
+# Contrib library options
+tvm_option(USE_BLAS "The blas library to be linked" none)
+tvm_option(USE_MKL_PATH "MKL root path when use MKL blas" none)
+tvm_option(USE_CUDNN "Build with cuDNN" OFF)
+tvm_option(USE_CUBLAS "Build with cuBLAS" OFF)
+tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF)
+tvm_option(USE_ROCBLAS "Build with ROCM:rocBLAS" OFF)
+tvm_option(USE_SORT "Build with sort support" OFF)
+tvm_option(USE_NNPACK "Build with nnpack support" OFF)
+tvm_option(USE_RANDOM "Build with random support" OFF)
+
+# include directories
include_directories("include")
-include_directories("HalideIR/src")
include_directories("dlpack/include")
+include_directories("dmlc-core/include")
-
+# initial variables
set(TVM_LINKER_LIBS "")
set(TVM_RUNTIME_LINKER_LIBS "")
-# compile
+# Generic compilation options
if(MSVC)
add_definitions(-DWIN32_LEAN_AND_MEAN)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
add_definitions(-D_SCL_SECURE_NO_WARNINGS)
- add_definitions(-DTVM_EXPORTS)
add_definitions(-DHalide_SHARED)
- add_definitions(-DHalide_EXPORTS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj")
@@ -61,146 +76,142 @@ if(MSVC)
else(MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
- set(CMAKE_C_FLAGS "-O3 -Wall -std=c++11 -fPIC")
- set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS})
+ set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+ set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
+ endif()
endif(MSVC)
# add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp")
-FILE(GLOB_RECURSE GROUP_Include "src/*.h" "include/*.h" "HalideIR/src/*.h")
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+ "nnvm/src/*.h" "nnvm/include/*.h")
assign_source_group("Source" ${GROUP_SOURCE})
-assign_source_group("Include" ${GROUP_Include})
+assign_source_group("Include" ${GROUP_INCLUDE})
+# Source file lists
file(GLOB COMPILER_SRCS
src/api/*.cc
src/arithmetic/*.cc
+ src/autotvm/*.cc
src/codegen/*.cc
src/codegen/stack_vm/*.cc
src/lang/*.cc
src/pass/*.cc
src/op/*.cc
src/schedule/*.cc
+ )
+
+if(NOT MSVC)
+ file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc)
+ list(APPEND COMPILER_SRCS ${COMPILER_VERILOG_SRCS})
+endif()
+
+file(GLOB_RECURSE NNVM_COMPILER_SRCS
+ nnvm/src/c_api/*.cc
+ nnvm/src/core/*.cc
+ nnvm/src/pass/*.cc
+ nnvm/src/compiler/*.cc
+ nnvm/src/top/*.cc
+ )
+
+file(GLOB TOPI_SRCS
+ topi/src/*.cc
)
file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
file(GLOB RUNTIME_SRCS src/runtime/*.cc)
-file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
-file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
-file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
-file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
-file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
-file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
-
-if(USE_CUDA)
-find_package(CUDA)
-# Find CUDA doesn't find all the libraries we need, add the extra ones
-find_library(CUDA_CUDA_LIBRARIES cuda
- PATHS ${CUDA_TOOLKIT_ROOT_DIR}
- PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
-find_library(CUDA_NVRTC_LIBRARIES nvrtc
- PATHS ${CUDA_TOOLKIT_ROOT_DIR}
- PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
- set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARIES})
-
- find_package(CUDA QUIET REQUIRED)
- message(STATUS "Build with CUDA support")
- include_directories(${CUDA_INCLUDE_DIRS})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
- list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
- if(MSVC)
- find_library(CUDA_NVRTC_LIB nvrtc
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/win32)
- list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB})
- else(MSVC)
- find_library(CUDA_NVRTC_LIB nvrtc
- ${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib)
- list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB})
- endif(MSVC)
- add_definitions(-DTVM_CUDA_RUNTIME=1)
-else(USE_CUDA)
- add_definitions(-DTVM_CUDA_RUNTIME=0)
-endif(USE_CUDA)
-
-if(USE_OPENCL)
- find_package(OpenCL QUIET REQUIRED)
- message(STATUS "Build with OpenCL support")
- include_directories(${OPENCL_INCLUDE_DIRS})
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
- list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
- add_definitions(-DTVM_OPENCL_RUNTIME=1)
-else(USE_OPENCL)
- add_definitions(-DTVM_OPENCL_RUNTIME=0)
-endif(USE_OPENCL)
-
-if(USE_METAL)
- find_package(OpenCL QUIET REQUIRED)
- message(STATUS "Build with Metal support")
- FIND_LIBRARY(METAL_LIB Metal)
- FIND_LIBRARY(FOUNDATION_LIB Foundation)
- list(APPEND TVM_RUNTIME_LINKER_LIBS ${METAL_LIB} ${FOUNDATION_LIB})
- list(APPEND RUNTIME_SRCS ${RUNTIME_METAL_SRCS})
- add_definitions(-DTVM_METAL_RUNTIME=1)
-else(USE_METAL)
- add_definitions(-DTVM_METAL_RUNTIME=0)
-endif(USE_METAL)
+
+# Package runtime rules
+if(NOT USE_RTTI)
+ add_definitions(-DDMLC_ENABLE_RTTI=0)
+endif()
if(USE_RPC)
message(STATUS "Build with RPC support...")
+ file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS})
endif(USE_RPC)
if(USE_GRAPH_RUNTIME)
message(STATUS "Build with Graph runtime support...")
+ file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
-endif(USE_GRAPH_RUNTIME)
-if(USE_LLVM)
- find_package(LLVM CONFIG REQUIRED)
- include_directories(${LLVM_INCLUDE_DIRS})
- add_definitions(${LLVM_DEFINITIONS})
- set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
- message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
- message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
- add_definitions(-DTVM_LLVM_VERSION=${TVM_LLVM_VERSION})
- add_definitions(-DDMLC_USE_FOPEN64=0)
- llvm_map_components_to_libnames(LLVM_LIBS all)
- list(REMOVE_ITEM LLVM_LIBS LTO)
- list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
- list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
- if(NOT MSVC)
- set_property(SOURCE ${COMPILER_LLVM_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS
- "-fno-rtti -DDMLC_ENABLE_RTTI=0")
- endif()
-endif(USE_LLVM)
-
-if(NOT USE_RTTI)
- add_definitions(-DDMLC_ENABLE_RTTI=0)
-endif()
+ if(USE_GRAPH_RUNTIME_DEBUG)
+ set_source_files_properties(${RUNTIME_GRAPH_SRCS}
+ PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
+ endif(USE_GRAPH_RUNTIME_DEBUG)
+endif(USE_GRAPH_RUNTIME)
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/CMakeLists.txt)
- include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include)
- if (INSTALL_DEV)
- install(
- DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include/." DESTINATION "include"
- FILES_MATCHING
- PATTERN "*.h"
- )
- endif()
-elseif(DMLC_CORE_PATH)
- include_directories(${DMLC_CORE_PATH}/include)
-endif()
+# Module rules
+include(cmake/modules/VTA.cmake)
+include(cmake/modules/CUDA.cmake)
+include(cmake/modules/OpenCL.cmake)
+include(cmake/modules/OpenGL.cmake)
+include(cmake/modules/Vulkan.cmake)
+include(cmake/modules/Metal.cmake)
+include(cmake/modules/ROCM.cmake)
+include(cmake/modules/LLVM.cmake)
+include(cmake/modules/contrib/BLAS.cmake)
+include(cmake/modules/contrib/Random.cmake)
+include(cmake/modules/contrib/Sort.cmake)
+include(cmake/modules/contrib/NNPack.cmake)
-list(APPEND RUNTIME_SRCS ${GROUP_Include})
add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
+add_library(tvm_topi SHARED ${TOPI_SRCS})
add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
+add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
+
target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
-target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(tvm_topi tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(nnvm_compiler tvm)
+
+# Related headers
+target_include_directories(
+ tvm
+ PUBLIC "HalideIR/src"
+ PUBLIC "topi/include")
+target_include_directories(
+ tvm_topi
+ PUBLIC "topi/include")
+target_include_directories(
+ nnvm_compiler
+ PUBLIC "nnvm/include"
+ PUBLIC "topi/include")
+
+# Tests
+set(TEST_EXECS "")
+file(GLOB TEST_SRCS tests/cpp/*.cc)
+find_library(GTEST_LIB gtest)
+
+if(GTEST_LIB)
+ foreach(__srcpath ${TEST_SRCS})
+ get_filename_component(__srcname ${__srcpath} NAME)
+ string(REPLACE ".cc" "" __execname ${__srcname})
+ add_executable(${__execname} ${__srcpath})
+ list(APPEND TEST_EXECS ${__execname})
+ target_link_libraries(${__execname}
+ tvm ${GTEST_LIB} pthread)
+ set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1)
+ set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
+ endforeach()
+ add_custom_target(cpptest DEPENDS ${TEST_EXECS})
+endif()
+
+# Custom targets
+add_custom_target(runtime DEPENDS tvm_runtime)
+
+# Installation rules
+install(TARGETS tvm DESTINATION lib${LIB_SUFFIX})
+install(TARGETS tvm_topi DESTINATION lib${LIB_SUFFIX})
install(TARGETS tvm_runtime DESTINATION lib${LIB_SUFFIX})
+install(TARGETS nnvm_compiler DESTINATION lib${LIB_SUFFIX})
+
if (INSTALL_DEV)
- install(TARGETS tvm DESTINATION lib${LIB_SUFFIX})
install(
DIRECTORY "include/." DESTINATION "include"
FILES_MATCHING
@@ -220,11 +231,25 @@ if (INSTALL_DEV)
DIRECTORY "dlpack/include/." DESTINATION "include"
FILES_MATCHING
PATTERN "*.h"
- )
+ )
+ install(
+ DIRECTORY "nnvm/include/." DESTINATION "include"
+ FILES_MATCHING
+ PATTERN "*.h"
+ )
else(INSTALL_DEV)
install(
DIRECTORY "include/tvm/runtime/." DESTINATION "include/tvm/runtime"
FILES_MATCHING
PATTERN "*.h"
- )
+ )
endif(INSTALL_DEV)
+
+# More target definitions
+if(MSVC)
+ target_compile_definitions(tvm PRIVATE -DHalide_EXPORTS)
+ target_compile_definitions(tvm_runtime PRIVATE -DHalide_EXPORTS)
+ target_compile_definitions(tvm PRIVATE -DTVM_EXPORTS)
+ target_compile_definitions(tvm_runtime PRIVATE -DTVM_EXPORTS)
+ target_compile_definitions(nnvm_compiler PRIVATE -DNNVM_EXPORTS)
+endif()
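
The new CMake flow is driven by a `config.cmake` copied into the build directory, with options overridden by appended `set(OPTION VALUE)` lines; the Jenkinsfile below uses the same pattern. A minimal local-build sketch under that convention (the option values are illustrative, not required):

```bash
# Out-of-source CMake build; option values here are examples only.
mkdir -p build && cd build
cp ../cmake/config.cmake .
echo 'set(USE_LLVM llvm-config-6.0)' >> config.cmake  # ON, OFF, or a path to llvm-config
echo 'set(USE_CUDA ON)' >> config.cmake               # enable optional backends as needed
cmake ..
make -j4
```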
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index ab9950a9f31d..6e3cf55b94b0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,37 +1,42 @@
-Contributors of TVM
-===================
-TVM adopts Apache style committer model. The package is developed and used by the community.
+TVM Contributors
+================
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use,
+contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community.
-We actively seek committers that comes from contributors who:
-- Made substantial contribution to the project.
-- Willing to spent time on maintaining and lead the project.
+See the [community structure document](http://docs.tvm.ai/contribute/community.html) for an explanation of the community structure and contribution guidelines.
-How to Contribute
------------------
-See [Contributor guide](docs/how_to/contribute.md) on how to contribute
-
-Committers
-----------
-Committers are people who have made substantial contribution to the project and granted write access to the project.
-- [Tianqi Chen](https://github.com/tqchen), University of Washington
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/), University of Washington
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/), University of Washington
-- [Ziheng Jiang](https://github.com/ZihengJiang), Fudan University
-
-Code Owners
------------
-[Code owners](CODEOWNERS) are people who make substantial contribution to a module
-and are qualified to lead development and review changes of the owned module.
+## Committers
+- [Tianqi Chen](https://github.com/tqchen) (PMC)
+- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
+- [Ziheng Jiang](https://github.com/ZihengJiang)
+- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
+- [Yizhi Liu](https://github.com/yzhliu)
+## Code Owners
- [Aditya Atluri](https://github.com/adityaatluri) ROCM
- [Leyuan Wang](https://github.com/Laurawly) TOPI
- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Yizhi Liu](https://github.com/javelinjs) JVM package
+- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
+- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+
+## Reviewers
+- [Masahiro Masuda](https://github.com/masahi)
+- [Kazutaka Morita](https://github.com/kazum)
+- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
+- [Siva](https://github.com/srkreddy1238)
+- [Alex Weaver](https://github.com/alex-weaver)
+- [Eddie Yan](https://github.com/eqy)
+- [Joshua Z. Zhang](https://github.com/zhreshold)
+- [Lianmin Zheng](https://github.com/merrymercy)
-List of Contributors
---------------------
+## List of Contributors
- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
- To contributors: please add your name to the list.
- [Qiao Zhang](https://github.com/zhangqiaorjc)
- [Jian Weng](https://github.com/were)
- [Masahiro Masuda](https://github.com/masahi)
+- [Haolong Zhang](https://github.com/haolongzhangm)
+- [Cody Hao Yu](https://github.com/comaniac)
+- [Chris Nuernberger](https://github.com/cnuernber)
+- [Tatsuya Nishiyama](https://github.com/nishi-t)
+- [Kazutaka Morita](https://github.com/kazum)
diff --git a/HalideIR b/HalideIR
index d91cf97d5d6c..a0b9563f4571 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit d91cf97d5d6cd2b47ec408bb08e978b88cbf6ab7
+Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
diff --git a/Jenkinsfile b/Jenkinsfile
index ef9666351ba5..8d76ebedeaae 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -4,13 +4,14 @@
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
// tvm libraries
-tvm_runtime = "lib/libtvm_runtime.so, config.mk"
-tvm_lib = "lib/libtvm.so, " + tvm_runtime
+tvm_runtime = "build/libtvm_runtime.so, build/config.cmake"
+tvm_lib = "build/libtvm.so, " + tvm_runtime
// LLVM upstream lib
-tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime
+tvm_multilib = "build/libtvm.so, " +
+ "build/libvta.so, build/libtvm_topi.so, build/libnnvm_compiler.so, " + tvm_runtime
// command to start a docker container
-docker_run = 'tests/ci_build/ci_build.sh'
+docker_run = 'docker/bash.sh'
// timeout in minutes
max_time = 60
@@ -38,7 +39,7 @@ stage("Sanity Check") {
node('linux') {
ws('workspace/tvm/sanity') {
init_git()
- sh "${docker_run} lint ./tests/scripts/task_lint.sh"
+ sh "${docker_run} tvmai/ci-lint ./tests/scripts/task_lint.sh"
}
}
}
@@ -47,14 +48,14 @@ stage("Sanity Check") {
// Run make. First try to do an incremental make from a previous workspace in hope to
// accelerate the compilation. If something wrong, clean the workspace and then
// build from scratch.
-def make(docker_type, make_flag) {
+def make(docker_type, path, make_flag) {
timeout(time: max_time, unit: 'MINUTES') {
try {
- sh "${docker_run} ${docker_type} make ${make_flag}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
} catch (exc) {
echo 'Incremental compilation failed. Fall back to build from scratch'
- sh "${docker_run} ${docker_type} make clean"
- sh "${docker_run} ${docker_type} make ${make_flag}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}"
+ sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
}
}
}
@@ -84,30 +85,35 @@ stage('Build') {
ws('workspace/tvm/build-gpu') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDNN=1 >> config.mk
- echo USE_CUDA=1 >> config.mk
- echo USE_OPENCL=1 >> config.mk
- echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
- echo USE_RPC=1 >> config.mk
- echo USE_GRAPH_RUNTIME=1 >> config.mk
- echo USE_BLAS=openblas >> config.mk
- rm -f lib/libtvm_runtime.so lib/libtvm.so
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_CUBLAS ON\\) >> config.cmake
+ echo set\\(USE_CUDNN ON\\) >> config.cmake
+ echo set\\(USE_CUDA ON\\) >> config.cmake
+ echo set\\(USE_OPENGL ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-6.0\\) >> config.cmake
+ echo set\\(USE_RPC ON\\) >> config.cmake
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
+ echo set\\(USE_BLAS openblas\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm40.so"
- sh "echo LLVM_CONFIG=llvm-config-5.0 >> config.mk"
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm50.so"
- sh "echo LLVM_CONFIG=llvm-config-6.0 >> config.mk"
- make('gpu', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm60.so"
+ make('tvmai/ci-gpu', 'build', '-j2')
pack_lib('gpu', tvm_multilib)
+ // compiler test
sh """
- echo USE_ROCM=1 >> config.mk
- echo ROCM_PATH=/opt/rocm >> config.mk
+ mkdir -p build2
+ cd build2
+ cp ../cmake/config.cmake .
+ echo set\\(USE_OPENCL ON\\) >> config.cmake
+ echo set\\(USE_ROCM ON\\) >> config.cmake
+ echo set\\(USE_VULKAN ON\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER clang-6.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('gpu', '-j2')
+ make('tvmai/ci-gpu', 'build2', '-j2')
}
}
},
@@ -116,13 +122,20 @@ stage('Build') {
ws('workspace/tvm/build-cpu') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo USE_RPC=0 >> config.mk
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('cpu', '-j2')
+ make('tvmai/ci-cpu', 'build', '-j2')
pack_lib('cpu', tvm_lib)
+ timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
+ sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
+ }
}
}
},
@@ -131,48 +144,19 @@ stage('Build') {
ws('workspace/tvm/build-i386') {
init_git()
sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
- echo USE_RPC=1 >> config.mk
+ mkdir -p build
+ cd build
+ cp ../cmake/config.cmake .
+ echo set\\(USE_SORT ON\\) >> config.cmake
+ echo set\\(USE_RPC ON\\) >> config.cmake
+ echo set\\(USE_LLVM llvm-config-5.0\\) >> config.cmake
+ echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
+ echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm40.so"
- sh "echo LLVM_CONFIG=llvm-config-5.0 >> config.mk"
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm50.so"
- sh "echo LLVM_CONFIG=llvm-config-6.0 >> config.mk"
- make('i386', '-j2')
- sh "mv lib/libtvm.so lib/libtvm_llvm60.so"
+ make('tvmai/ci-i386', 'build', '-j2')
pack_lib('i386', tvm_multilib)
}
}
- },
- 'web': {
- node('emcc') {
- ws('workspace/tvm/build-weblib') {
- init_git()
- sh """
- cp make/config.mk .
- echo USE_CUDA=0 >> config.mk
- echo USE_OPENCL=0 >> config.mk
- echo LLVM_CONFIG=llvm-config >> config.mk
- echo USE_RPC=0 >> config.mk
- """
- sh "${docker_run} emscripten echo testing javascript..."
- timeout(time: max_time, unit: 'MINUTES') {
- try {
- sh "${docker_run} emscripten ./tests/scripts/task_web_build.sh"
- } catch (exc) {
- echo 'Incremental compilation failed. Fall back to build from scratch'
- sh "${docker_run} emscripten make clean"
- sh "${docker_run} emscripten ./tests/scripts/task_web_build.sh"
- }
- }
- pack_lib('weblib', tvm_lib)
- }
- }
}
}
@@ -182,14 +166,8 @@ stage('Unit Test') {
ws('workspace/tvm/ut-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_unittest.sh"
- }
- // Test on the lastest mainline.
- sh "cp lib/libtvm_llvm60.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_unittest.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_unittest.sh"
}
}
}
@@ -199,26 +177,10 @@ stage('Unit Test') {
ws('workspace/tvm/ut-python-i386') {
init_git()
unpack_lib('i386', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} i386 ./tests/scripts/task_python_unittest.sh"
- sh "${docker_run} i386 ./tests/scripts/task_python_integration.sh"
- }
- // Test on llvm 5.0
- sh "cp lib/libtvm_llvm50.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} i386 ./tests/scripts/task_python_integration.sh"
- }
- }
- }
- },
- 'cpp': {
- node('linux') {
- ws('workspace/tvm/ut-cpp') {
- init_git()
- unpack_lib('cpu', tvm_lib)
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} cpu ./tests/scripts/task_cpp_unittest.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_unittest.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_integration.sh"
+ sh "${docker_run} tvmai/ci-i386 ./tests/scripts/task_python_vta.sh"
}
}
}
@@ -228,9 +190,8 @@ stage('Unit Test') {
ws('workspace/tvm/ut-java') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_java_unittest.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_java_unittest.sh"
}
}
}
@@ -243,22 +204,11 @@ stage('Integration Test') {
ws('workspace/tvm/it-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
- timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_integration.sh"
- sh "${docker_run} gpu ./tests/scripts/task_python_topi.sh"
- }
- }
- }
- },
- 'web': {
- node('emcc') {
- ws('workspace/tvm/it-weblib') {
- init_git()
- unpack_lib('weblib', tvm_lib)
- sh "${docker_run} emscripten echo testing javascript..."
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} emscripten ./tests/scripts/task_web_test.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_integration.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_topi.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_cpp_topi.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_nnvm.sh"
}
}
}
@@ -268,9 +218,8 @@ stage('Integration Test') {
ws('workspace/tvm/docs-python-gpu') {
init_git()
unpack_lib('gpu', tvm_multilib)
- sh "cp lib/libtvm_llvm40.so lib/libtvm.so"
timeout(time: max_time, unit: 'MINUTES') {
- sh "${docker_run} gpu ./tests/scripts/task_python_docs.sh"
+ sh "${docker_run} tvmai/ci-gpu ./tests/scripts/task_python_docs.sh"
}
pack_lib('mydocs', 'docs.tgz')
}
diff --git a/Makefile b/Makefile
index 4a16d5162102..2d3d4843c4c0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,7 @@
ROOTDIR = $(CURDIR)
-ifndef config
-ifneq ("$(wildcard ./config.mk)","")
- config ?= config.mk
-else
- config ?= make/config.mk
-endif
-endif
-
-include $(config)
-
-.PHONY: clean install installdev all test doc pylint cpplint lint verilog cython cython2 cython3 web runtime
+.PHONY: clean all test doc pylint cpplint lint\
+ cython cython2 cython3 web runtime vta
ifndef DMLC_CORE_PATH
DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
@@ -20,242 +11,65 @@ ifndef DLPACK_PATH
DLPACK_PATH = $(ROOTDIR)/dlpack
endif
-UNAME_S := $(shell uname -s)
-
-# The flags
-LLVM_CFLAGS= -fno-rtti -DDMLC_ENABLE_RTTI=0 -DDMLC_USE_FOPEN64=0
-LDFLAGS = -pthread -lm -ldl
-INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include -IHalideIR/src -Itopi/include
-CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC
-FRAMEWORKS =
-OBJCFLAGS = -fno-objc-arc
-EMCC_FLAGS= -s RESERVED_FUNCTION_POINTERS=2 -s NO_EXIT_RUNTIME=1 -s MAIN_MODULE=1 -DDMLC_LOG_STACK_TRACE=0\
- -std=c++11 -Oz $(INCLUDE_FLAGS)
-
-# llvm configuration
-ifdef LLVM_CONFIG
- LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
- LLVM_INCLUDE=$(filter -I%, $(shell $(LLVM_CONFIG) --cxxflags))
- LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags --libs --system-libs)
- LLVM_CFLAGS += $(LLVM_INCLUDE) -DTVM_LLVM_VERSION=$(LLVM_VERSION)
-else
- LLVM_VERSION=00
-endif
-
-# The source code dependencies
-LIB_HALIDEIR = HalideIR/lib/libHalideIR.a
+INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
+PKG_CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC
+PKG_LDFLAGS =
-CC_SRC = $(filter-out src/contrib/%.cc src/runtime/%.cc src/codgen/llvm/%.cc,\
- $(wildcard src/*/*.cc src/*/*/*.cc))
-LLVM_SRC = $(wildcard src/codegen/llvm/*.cc src/codegen/llvm/*/*.cc)
-METAL_SRC = $(wildcard src/runtime/metal/*.mm)
-CUDA_SRC = $(wildcard src/runtime/cuda/*.cc)
-ROCM_SRC = $(wildcard src/runtime/rocm/*.cc)
-OPENCL_SRC = $(wildcard src/runtime/opencl/*.cc)
-RPC_SRC = $(wildcard src/runtime/rpc/*.cc)
-GRAPH_SRC = $(wildcard src/runtime/graph/*.cc)
-RUNTIME_SRC = $(wildcard src/runtime/*.cc)
-# Objectives
-LLVM_BUILD = build/llvm${LLVM_VERSION}
-LLVM_OBJ = $(patsubst src/%.cc, ${LLVM_BUILD}/%.o, $(LLVM_SRC))
-METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
-CUDA_OBJ = $(patsubst src/%.cc, build/%.o, $(CUDA_SRC))
-ROCM_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCM_SRC))
-OPENCL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENCL_SRC))
-RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC))
-GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC))
-CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ)
-RUNTIME_OBJ = $(patsubst src/%.cc, build/%.o, $(RUNTIME_SRC))
-CONTRIB_OBJ =
+all:
+ @mkdir -p build && cd build && cmake .. && $(MAKE)
-# Deps
-ALL_DEP = $(CC_OBJ) $(CONTRIB_OBJ) $(LIB_HALIDEIR)
-RUNTIME_DEP = $(RUNTIME_OBJ)
+runtime:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) runtime
-# Dependency specific rules
-ifdef CUDA_PATH
- NVCC=$(CUDA_PATH)/bin/nvcc
- CFLAGS += -I$(CUDA_PATH)/include
- LDFLAGS += -L$(CUDA_PATH)/lib64
-endif
+vta:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) vta
-ifeq ($(USE_CUDA), 1)
- CFLAGS += -DTVM_CUDA_RUNTIME=1
- LDFLAGS += -lcuda -lcudart -lnvrtc
- RUNTIME_DEP += $(CUDA_OBJ)
-else
- CFLAGS += -DTVM_CUDA_RUNTIME=0
-endif
+cpptest:
+ @mkdir -p build && cd build && cmake .. && $(MAKE) cpptest
-ifdef ROCM_PATH
- CFLAGS += -I$(ROCM_PATH)/include
- LDFLAGS += -L$(ROCM_PATH)/lib
-endif
+# EMCC; Web related scripts
+EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\
+ -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\
+ -s TOTAL_MEMORY=1073741824\
+ -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\
+ -s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\
+ $(INCLUDE_FLAGS)
-ifeq ($(USE_ROCM), 1)
- CFLAGS += -DTVM_ROCM_RUNTIME=1 -D__HIP_PLATFORM_HCC__=1
- LDFLAGS += -lhip_hcc
- RUNTIME_DEP += $(ROCM_OBJ)
-else
- CFLAGS += -DTVM_ROCM_RUNTIME=0
-endif
-
-ifeq ($(USE_OPENCL), 1)
- CFLAGS += -DTVM_OPENCL_RUNTIME=1
- ifeq ($(UNAME_S), Darwin)
- FRAMEWORKS += -framework OpenCL
- else
- LDFLAGS += -lOpenCL
- endif
- RUNTIME_DEP += $(OPENCL_OBJ)
-else
- CFLAGS += -DTVM_OPENCL_RUNTIME=0
-endif
-
-ifeq ($(USE_METAL), 1)
- CFLAGS += -DTVM_METAL_RUNTIME=1
- LDFLAGS += -lobjc
- RUNTIME_DEP += $(METAL_OBJ)
- FRAMEWORKS += -framework Metal -framework Foundation
-else
- CFLAGS += -DTVM_METAL_RUNTIME=0
-endif
-
-ifeq ($(USE_RPC), 1)
- RUNTIME_DEP += $(RPC_OBJ)
-endif
-
-ifeq ($(USE_GRAPH_RUNTIME), 1)
- RUNTIME_DEP += $(GRAPH_OBJ)
-endif
-
-include make/contrib/cblas.mk
-include make/contrib/nnpack.mk
-include make/contrib/cudnn.mk
-
-ifdef ADD_CFLAGS
- CFLAGS += $(ADD_CFLAGS)
-endif
-
-ifdef ADD_LDFLAGS
- LDFLAGS += $(ADD_LDFLAGS)
-endif
-
-ifeq ($(OS),Windows_NT)
- JVM_PKG_PROFILE := windows
- SHARED_LIBRARY_SUFFIX := dll
-else
- UNAME_S := $(shell uname -s)
- ifeq ($(UNAME_S), Darwin)
- JVM_PKG_PROFILE := osx-x86_64
- SHARED_LIBRARY_SUFFIX := dylib
- else
- JVM_PKG_PROFILE := linux-x86_64
- SHARED_LIBRARY_SUFFIX := so
- endif
-endif
+web: build/libtvm_web_runtime.js build/libtvm_web_runtime.bc
-JVM_TEST_ARGS := $(if $(JVM_TEST_ARGS),$(JVM_TEST_ARGS),-DskipTests -Dcheckstyle.skip=true)
-
-ifeq ($(USE_CUDA), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else ifeq ($(USE_OPENCL), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else ifeq ($(USE_METAL), 1)
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-gpu
-else
- JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-cpu
-endif
-
-BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
-all: ${BUILD_TARGETS}
-runtime: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
-web: lib/libtvm_web_runtime.js lib/libtvm_web_runtime.bc
-
-include tests/cpp/unittest.mk
-
-test: $(TEST)
-
-include verilog/verilog.mk
-verilog: $(VER_LIBS)
-
-# Special rules for LLVM related modules.
-${LLVM_BUILD}/codegen/llvm/%.o: src/codegen/llvm/%.cc
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(LLVM_CFLAGS) -MM -MT ${LLVM_BUILD}/codegen/llvm/$*.o $< >${LLVM_BUILD}/codegen/llvm/$*.d
- $(CXX) -c $(CFLAGS) $(LLVM_CFLAGS) -c $< -o $@
-
-build/runtime/metal/%.o: src/runtime/metal/%.mm
- @mkdir -p $(@D)
- $(CXX) $(OBJCFLAGS) $(CFLAGS) -MM -MT build/runtime/metal/$*.o $< >build/runtime/metal/$*.d
- $(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@
-
-build/%.o: src/%.cc
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
- $(CXX) -c $(CFLAGS) -c $< -o $@
-
-lib/libtvm.dylib: $(ALL_DEP) $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_runtime.dylib: $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm.so: $(ALL_DEP) $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_runtime.so: $(RUNTIME_DEP)
- @mkdir -p $(@D)
- $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
-
-lib/libtvm_web_runtime.bc: web/web_runtime.cc
+build/libtvm_web_runtime.bc: web/web_runtime.cc
@mkdir -p build/web
@mkdir -p $(@D)
- $(CXX) $(CFLAGS) -MM -MT lib/libtvm_web_runtime.bc $< >build/web/web_runtime.d
+ emcc $(EMCC_FLAGS) -MM -MT build/libtvm_web_runtime.bc $< >build/web/web_runtime.d
emcc $(EMCC_FLAGS) -o $@ web/web_runtime.cc
-lib/libtvm_web_runtime.js: lib/libtvm_web_runtime.bc
+build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
@mkdir -p $(@D)
- emcc $(EMCC_FLAGS) -o $@ lib/libtvm_web_runtime.bc
-
-$(LIB_HALIDEIR): LIBHALIDEIR
-
-LIBHALIDEIR:
- + cd HalideIR; make lib/libHalideIR.a DMLC_CORE_PATH=../dmlc-core; cd $(ROOTDIR)
+ emcc $(EMCC_FLAGS) -o $@ build/libtvm_web_runtime.bc
+# Lint scripts
cpplint:
- python dmlc-core/scripts/lint.py topi cpp topi/include;
- python dmlc-core/scripts/lint.py tvm cpp include src verilog\
+ python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+ python3 dmlc-core/scripts/lint.py topi cpp topi/include;
+ python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+ python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
examples/extension/src examples/graph_executor/src
pylint:
- pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
- pylint topi/python/topi --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint topi/python/topi --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint nnvm/python/nnvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
+ python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
jnilint:
- python dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+ python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
lint: cpplint pylint jnilint
doc:
doxygen docs/Doxyfile
-install: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX)
- mkdir -p $(DESTDIR)$(PREFIX)/include/tvm/runtime
- cp -R include/tvm/runtime/. $(DESTDIR)$(PREFIX)/include/tvm/runtime
- cp lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
-
-installdev: lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) lib/libtvm.a
- mkdir -p $(DESTDIR)$(PREFIX)/include
- cp -R include/tvm $(DESTDIR)$(PREFIX)/include
- cp lib/libtvm.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
- cp lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) $(DESTDIR)$(PREFIX)/lib
- cp lib/libtvm.a $(DESTDIR)$(PREFIX)/lib
-
# Cython build
cython:
cd python; python setup.py build_ext --inplace
@@ -269,22 +83,34 @@ cython3:
cyclean:
rm -rf python/tvm/*/*/*.so python/tvm/*/*/*.dylib python/tvm/*/*/*.cpp
+# JVM build rules
+ifeq ($(OS),Windows_NT)
+ JVM_PKG_PROFILE := windows
+ SHARED_LIBRARY_SUFFIX := dll
+else
+ UNAME_S := $(shell uname -s)
+ ifeq ($(UNAME_S), Darwin)
+ JVM_PKG_PROFILE := osx-x86_64
+ SHARED_LIBRARY_SUFFIX := dylib
+ else
+ JVM_PKG_PROFILE := linux-x86_64
+ SHARED_LIBRARY_SUFFIX := so
+ endif
+endif
+
+JVM_TEST_ARGS := $(if $(JVM_TEST_ARGS),$(JVM_TEST_ARGS),-DskipTests -Dcheckstyle.skip=true)
+
jvmpkg:
(cd $(ROOTDIR)/jvm; \
mvn clean package -P$(JVM_PKG_PROFILE) -Dcxx="$(CXX)" \
- -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
- -Dcurrent_libdir="$(ROOTDIR)/lib" $(JVM_TEST_ARGS))
+ -Dcflags="$(PKG_CFLAGS)" -Dldflags="$(PKG_LDFLAGS)" \
+ -Dcurrent_libdir="$(ROOTDIR)/build" $(JVM_TEST_ARGS))
jvminstall:
(cd $(ROOTDIR)/jvm; \
mvn install -P$(JVM_PKG_PROFILE) -Dcxx="$(CXX)" \
- -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
- -Dcurrent_libdir="$(ROOTDIR)/lib" $(JVM_TEST_ARGS))
+ -Dcflags="$(PKG_CFLAGS)" -Dldflags="$(PKG_LDFLAGS)" \
+ -Dcurrent_libdir="$(ROOTDIR)/build" $(JVM_TEST_ARGS))
+# clean rule
clean:
- $(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o */*.d */*/*.d */*/*/*.d
- cd HalideIR; make clean; cd $(ROOTDIR)
-
--include build/*.d
--include build/*/*.d
--include build/*/*/*.d
--include build/*/*/*/*.d
+ @mkdir -p build && cd build && cmake .. && $(MAKE) clean
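
After this change the Makefile is a thin shim over CMake: each top-level target configures `build/` and forwards to the generated makefile, so there is no `config.mk` to edit. For example:

```bash
make           # configure with CMake, then build all libraries
make runtime   # build only the tvm_runtime library
make cpptest   # build the gtest-based C++ unit tests (requires libgtest)
```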
diff --git a/NEWS.md b/NEWS.md
index 6bc97b163ab1..567aabf3fcbd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,11 +3,104 @@ TVM Change Log
This file records the changes in TVM library in reverse chronological order.
+## On-going version
-## On onging verison
+Refer to the Roadmap issue for a complete list of features in the on-going version.
+If you check in something that is not reflected in the Roadmap issue, please reply
+to that issue so it can be added.
+
+## 0.3
+
+This release features numerous improvements in TOPI and backends. We take the first step toward object detection support in TOPI, featuring the operators necessary for YOLO and SSD. TOPI now supports a numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs in the browser, and Vulkan for running on the next-generation graphics API.
+
+- TOPI Vision operators
+ - SSD support
+ - YOLO support
+ - NMS operator support in vision
+- TOPI general numpy-style operators
+ - numpy style operator overload in topi
+ - more operators: flip, take
+ - dilation support on conv2d and depthwise
+- 8bit support
+ - ARM 8bit gemm
+ - ARM 8bit conv
+- Low bit operator support
+ - popcount intrinsics
+ - 1-bit fully connected
+- Contrib: MPSDNN fully-connected and conv2d support
+- Better RPC support
+ - RPC Tracker support to allow centralized resource management
+ - RPC protocol upgrade (this is a non-backward compatible change) to support timeout in the proxy
+    - This is a breaking change; you need to use the latest version of the TVM runtime with the RPC
+ - Fault-tolerant to early server termination with correct exception propagated
+ - RPC support enabled for ROCm AMDGPUs
+- Tutorials and docs
+ - How to deploy to android devices.
+- Optimizations for hardware backends
+ - intel CPU (AVX and AVX512)
+- Schedule Primitives
+  - rfactor now supports factor_axis to specify the factored dimension in the result
+  - cache_write now supports multiple output operators
+ - enable warp memory which generates shuffle instructions
+- Framework bridge
+ - MXNet bridge supported
+- C++ compiler API support
+ - build migration
+ - topi migration to c++
+ - Target system in c++
+- WebGL backend
+ - runtime and codegen
+ - topi integration
+ - end to end pipeline on the browser
+- Vulkan backend
+ - vulkan runtime
+ - spirv code generator
+- Security
+ - intel SGX runtime support
+ - multi-threaded SGX runtime
+- LLVM 7.0 support
+- Robustness
+  - VerifyMemory to detect incorrect GPU schedules that write into GPU memory from the CPU
+ - Verify compute formulas
+- Better CPU parallel runtime
+
+## 0.2
+
+This release comes with a complete set of TOPI support for the NNVM compiler, which allows compilation of end-to-end workloads.
+We also make major improvements in supporting new backends: ROCm for AMD GPUs and ARM GPUs.
+
+- Backend support
+  - Support LLVM mainline (4.0, 5.0, 6.0)
+ - Support ROCM stack for AMD GPUs
+ - More robust OpenCL support for ARM GPUs
+- Android RPC runtime
+- Multi-threading optimization for ARM
+ - multi-threaded depthwise
+ - multi-threaded conv2d
+- New schedule primitives
+ - storage_align for shared memory alignment
+ - double_buffer
- UnrollLoop : more robust version of unroll loop, count maximum steps that can be unrolled.
+- Full set of TOPI operators
+  - Introduce tvm.target to better specify target options for compilation
+  - broadcast/reduction operators
+ - pooling and global pooling
+ - Generic target support for topi
+ - schedule with external libraries
+- End to end deep learning pipelines for CPU, GPU, ARM GPU
+- Tutorials
+ - How to load compiled module in any language runtime
+ - How to use java runtime
+- Contrib library: MIOpen, CuDNN
+- Ongoing items that contain functioning pieces
+ - WebGL backend
+ - C++ compiler support
+ - MPS DNN
+ - low bit support, introduced popcount
+
+
+## 0.1
-## 0.1rc
- Language runtime
- python
- javascript
diff --git a/README.md b/README.md
index 07e550d76043..561ca91d5abe 100644
--- a/README.md
+++ b/README.md
@@ -1,33 +1,27 @@
-TVM: Tensor IR Stack for Deep Learning Systems
+Open Deep Learning Compiler Stack
==============================================
-[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
-[![Build Status](http://mode-gpu.cs.washington.edu:8080/buildStatus/icon?job=dmlc/tvm/master)](http://mode-gpu.cs.washington.edu:8080/job/dmlc/job/tvm/job/master/)
+[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE)
+[![Build Status](http://mode-gpu.cs.washington.edu:8080/buildStatus/icon?job=tvm/master)](http://mode-gpu.cs.washington.edu:8080/job/tvm/job/master/)
-[Installation](docs/how_to/install.md) |
-[Documentation](http://docs.tvmlang.org) |
-[Tutorials](http://tutorials.tvmlang.org) |
-[Operator Inventory](topi) |
-[FAQ](docs/faq.md) |
+[Documentation](https://docs.tvm.ai) |
[Contributors](CONTRIBUTORS.md) |
+[Community](https://tvm.ai/community.html) |
[Release Notes](NEWS.md)
-TVM is a Tensor intermediate representation(IR) stack for deep learning systems. It is designed to close the gap between the
+TVM is a compiler stack for deep learning systems. It is designed to close the gap between the
productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends.
TVM works with deep learning frameworks to provide end to end compilation to different backends.
-Checkout our [announcement](http://tvmlang.org/2017/08/17/tvm-release-announcement.html) for more details.
+Check out the [TVM stack homepage](https://tvm.ai/) for more information.
License
-------
-© Contributors, 2017. Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license.
+© Contributors. Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license.
Contribute to TVM
-----------------
 TVM adopts the Apache committer model; we aim to create an open source project that is maintained and owned by the community.
-
-- [Contributor Guide](docs/how_to/contribute.md)
-- Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md)
-- Please also update [NEWS.md](NEWS.md) on changes and improvements in API and codes.
+Check out the [Contributor Guide](https://docs.tvm.ai/contribute/).
Acknowledgement
---------------
diff --git a/apps/README.md b/apps/README.md
index 254f8c26a510..2345cc3ab548 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -3,9 +3,9 @@ This folder contains various extension projects using TVM,
they also serve as examples on how to use TVM in your own project.
If you are interested in writing optimized kernels with TVM, checkout [TOPI: TVM Operator Inventory](../topi).
-If you are interested in end to end deep learning model compilation, checkout [NNVM Compiler](https://github.com/dmlc/nnvm).
- [extension](extension) How to extend TVM C++ api along with python API.
- [ios_rpc](ios_rpc) iOS RPC server.
- [android_rpc](android_rpc) Android RPC server.
+- [benchmark](benchmark) Example end-to-end compilation benchmarks
- [howto_deploy](howto_deploy) Tutorial on how to deploy TVM with minimum code dependency.
diff --git a/apps/android_deploy/.gitignore b/apps/android_deploy/.gitignore
new file mode 100644
index 000000000000..39fb081a42a8
--- /dev/null
+++ b/apps/android_deploy/.gitignore
@@ -0,0 +1,9 @@
+*.iml
+.gradle
+/local.properties
+/.idea/workspace.xml
+/.idea/libraries
+.DS_Store
+/build
+/captures
+.externalNativeBuild
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
new file mode 100644
index 000000000000..801ca8bdf95c
--- /dev/null
+++ b/apps/android_deploy/README.md
@@ -0,0 +1,119 @@
+# Android TVM Demo
+
+This folder contains an Android demo app that shows how to deploy a model using the TVM runtime API on an Android phone.
+
+You will need the [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
+
+## Build and Installation
+
+### Build APK
+
+We use [Gradle](https://gradle.org) to build. Please follow [the installation instructions](https://gradle.org/install) for your operating system.
+
+Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
+
+```
+dependencies {
+ compile fileTree(dir: 'libs', include: ['*.jar'])
+ androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+ exclude group: 'com.android.support', module: 'support-annotations'
+ })
+ compile 'com.android.support:appcompat-v7:26.0.1'
+ compile 'com.android.support.constraint:constraint-layout:1.0.2'
+ compile 'com.android.support:design:26.0.1'
+ compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+ testCompile 'junit:junit:4.12'
+}
+```
+
+By default the application is built with the CPU flavor of the TVM runtime; follow the instructions below to set it up.
+In `app/src/main/jni/make` you will find the JNI Makefile config `config.mk`. Copy it to `app/src/main/jni` and modify it as needed.
+
+```bash
+cd apps/android_deploy/app/src/main/jni
+cp make/config.mk .
+```
+
+Here is an example `config.mk`:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 0
+```
+
+Now use Gradle to compile the JNI code, resolve Java dependencies, and build the Android application together with tvm4j. Run the following script to generate the APK file.
+
+```bash
+export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
+cd apps/android_deploy
+gradle clean build
+```
+
+In `app/build/outputs/apk` you'll find `app-release-unsigned.apk`. Use `dev_tools/gen_keystore.sh` to generate a signing key and `dev_tools/sign_apk.sh` to get the signed APK file `app/build/outputs/apk/tvmdemo-release.apk`.
+
+Upload `tvmdemo-release.apk` to your Android device and install it.
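+
+If your device is connected over USB with debugging enabled, you can also install it from the command line:
+
+```bash
+# Install (or update, with -r) the signed APK on a connected device.
+adb install -r app/build/outputs/apk/tvmdemo-release.apk
+```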
+
+### Build with OpenCL
+
+The application does not link against the OpenCL library unless you configure it to. Modify the JNI Makefile config in `app/src/main/jni` with the proper target OpenCL configuration.
+
+Here is an example `config.mk` with OpenCL enabled:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 1
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS = libOpenCL.so
+```
+
+Note that you should specify the correct GPU development headers for your Android device. Run `adb shell dumpsys | grep GLES` to find out which GPU your Android device uses. It is very likely that the library (libOpenCL.so) is already present on the mobile device. For instance, I found it under `/system/vendor/lib64`. You can run `adb pull /system/vendor/lib64/libOpenCL.so ./` to copy the file to your desktop.
+
+After you set up `config.mk`, follow the instructions in [Build APK](#buildapk) to build the Android package with the OpenCL flavor.
+
+## Cross Compile and Run on Android Devices
+
+### Architecture and Android Standalone Toolchain
+
+In order to cross-compile a shared library (.so) for your Android device, you have to know the target triple for the device (refer to [Cross-compilation using Clang](https://clang.llvm.org/docs/CrossCompilation.html) for more information). Run `adb shell cat /proc/cpuinfo` to list the device's CPU information.
+
+Now use the NDK to generate a standalone toolchain for your device. For my test device, I used the following command.
+
+```bash
+cd /opt/android-ndk/build/tools/
+./make-standalone-toolchain.sh --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64
+```
+
+If everything goes well, you will find the compile tools in `/opt/android-toolchain-arm64/bin`. For example, `bin/aarch64-linux-android-g++` can be used to compile C++ source code and create shared libraries for arm64 Android devices.
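+
+A minimal sketch of such an invocation, assuming the toolchain path above and a hypothetical object file `lib.o` produced by your build:
+
+```bash
+# Hypothetical example: lib.o stands in for the object file you want to package.
+export TOOLCHAIN=/opt/android-toolchain-arm64
+$TOOLCHAIN/bin/aarch64-linux-android-g++ -shared -fPIC -o deploy_lib.so lib.o
+```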
+
+### Place compiled model on Android application assets folder
+
+Follow the instructions [here](http://docs.tvm.ai/deploy/android.html) to get a compiled model for the Android target.
+
+Copy the compiled model files `deploy_lib.so`, `deploy_graph.json` and `deploy_param.params` to `apps/android_deploy/app/src/main/assets/` and set the TVM flavor in [MainActivity.java](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java#L81):
+
+`CPU version flavor`
+```
+ private static final boolean EXE_GPU = false;
+```
+
+`OpenCL version flavor`
+```
+ private static final boolean EXE_GPU = true;
+```
+
+
+Install the compiled Android application on your phone and enjoy the image classifier demo using the extraction model.
+
+You can define your own TVM operators and deploy them via this demo application on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_deploy/app/.gitignore b/apps/android_deploy/app/.gitignore
new file mode 100644
index 000000000000..796b96d1c402
--- /dev/null
+++ b/apps/android_deploy/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/apps/android_deploy/app/build.gradle b/apps/android_deploy/app/build.gradle
new file mode 100644
index 000000000000..6790308a9ec4
--- /dev/null
+++ b/apps/android_deploy/app/build.gradle
@@ -0,0 +1,56 @@
+// import DownloadModels task
+project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
+project.ext.TMP_DIR = project.buildDir.toString() + '/downloads'
+
+// Download the default models (a compiled version of the darknet framework extraction model);
+// if you wish to use your own models, place them in the "assets" directory
+// and comment out this line.
+apply from: "download-models.gradle"
+
+apply plugin: 'com.android.application'
+
+task buildJni(type: Exec, description: 'Build JNI libs') {
+ commandLine 'sh', 'src/main/jni/build.sh'
+}
+
+tasks.withType(JavaCompile) {
+ compileTask -> compileTask.dependsOn buildJni
+}
+
+android {
+ compileSdkVersion 26
+ buildToolsVersion "26.0.1"
+ defaultConfig {
+ applicationId "ml.dmlc.tvm.android.demo"
+ minSdkVersion 17
+ targetSdkVersion 26
+ versionCode 1
+ versionName "1.0"
+ testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+ }
+ buildTypes {
+ release {
+ minifyEnabled false
+ proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+ }
+ }
+ sourceSets {
+ main {
+ jni.srcDirs = []
+ jniLibs.srcDirs = ['src/main/libs']
+ assets.srcDirs = [project.ext.ASSET_DIR]
+ }
+ }
+}
+
+dependencies {
+ compile fileTree(dir: 'libs', include: ['*.jar'])
+ androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+ exclude group: 'com.android.support', module: 'support-annotations'
+ })
+ compile 'com.android.support:appcompat-v7:26.0.1'
+ compile 'com.android.support.constraint:constraint-layout:1.0.2'
+ compile 'com.android.support:design:26.0.1'
+ compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+ testCompile 'junit:junit:4.12'
+}
diff --git a/apps/android_deploy/app/download-models.gradle b/apps/android_deploy/app/download-models.gradle
new file mode 100644
index 000000000000..5b0509fbca2b
--- /dev/null
+++ b/apps/android_deploy/app/download-models.gradle
@@ -0,0 +1,64 @@
+/*
+ * download-models.gradle
+ * Downloads model files from ${MODEL_URL} into application's asset folder
+ * Input:
+ * project.ext.TMP_DIR: absolute path to hold downloaded zip files
+ * project.ext.ASSET_DIR: absolute path to save unzipped model files
+ * Output:
+ * 3 model files will be downloaded into given folder of ext.ASSET_DIR
+ */
+// hard coded model files
+def models = ['extraction.zip']
+
+// Root URL for model archives
+def MODEL_URL = 'https://github.com/PariksheetPinjari909/TVM_models/blob/master/extraction_model'
+buildscript {
+ repositories {
+ jcenter()
+ }
+ dependencies {
+ classpath 'de.undercouch:gradle-download-task:3.2.0'
+ }
+}
+
+import de.undercouch.gradle.tasks.download.Download
+task downloadFile(type: Download){
+ for (f in models) {
+ src "${MODEL_URL}/" + f + "?raw=true"
+ dest new File(project.ext.TMP_DIR + "/" + f)
+ }
+ overwrite true
+}
+
+task extractModels(type: Copy) {
+ def needDownload = false
+ for (f in models) {
+ def localFile = f.split("/")[-1]
+ if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) {
+ needDownload = true
+ }
+ }
+
+ if (needDownload) {
+ dependsOn downloadFile
+ }
+
+ for (f in models) {
+ def localFile = f.split("/")[-1]
+ from zipTree(project.ext.TMP_DIR + '/' + localFile)
+ }
+
+ into file(project.ext.ASSET_DIR)
+ fileMode 0644
+ exclude '**/LICENSE'
+}
+
+tasks.whenTaskAdded { task ->
+ if (task.name == 'assembleDebug') {
+ task.dependsOn 'extractModels'
+ }
+ if (task.name == 'assembleRelease') {
+ task.dependsOn 'extractModels'
+ }
+}
+
diff --git a/apps/android_deploy/app/src/main/AndroidManifest.xml b/apps/android_deploy/app/src/main/AndroidManifest.xml
new file mode 100644
index 000000000000..bac82ee90faa
--- /dev/null
+++ b/apps/android_deploy/app/src/main/AndroidManifest.xml
@@ -0,0 +1,37 @@
+<!-- AndroidManifest.xml markup omitted -->
diff --git a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
new file mode 100644
index 000000000000..f3cdefe1c2ff
--- /dev/null
+++ b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ml.dmlc.tvm.android.demo;
+
+import android.Manifest;
+import android.content.Intent;
+import android.content.pm.PackageManager;
+import android.content.res.AssetManager;
+import android.app.AlertDialog;
+import android.app.ProgressDialog;
+import android.content.DialogInterface;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
+import android.graphics.Canvas;
+import android.graphics.Matrix;
+import android.net.Uri;
+import android.os.AsyncTask;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Environment;
+import android.os.SystemClock;
+import android.provider.MediaStore;
+import android.support.v4.content.FileProvider;
+import android.support.v7.app.AppCompatActivity;
+import android.support.v7.widget.Toolbar;
+import android.util.Log;
+import android.view.View;
+import android.widget.ImageView;
+import android.widget.TextView;
+import android.widget.Toast;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Vector;
+
+import ml.dmlc.tvm.Function;
+import ml.dmlc.tvm.Module;
+import ml.dmlc.tvm.NDArray;
+import ml.dmlc.tvm.TVMContext;
+import ml.dmlc.tvm.TVMValue;
+import ml.dmlc.tvm.TVMType;
+
+public class MainActivity extends AppCompatActivity {
+ private static final String TAG = MainActivity.class.getSimpleName();
+
+ private static final int PERMISSIONS_REQUEST = 100;
+ private static final int PICTURE_FROM_GALLERY = 101;
+ private static final int PICTURE_FROM_CAMERA = 102;
+ private static final int IMAGE_PREVIEW_WIDTH = 960;
+ private static final int IMAGE_PREVIEW_HEIGHT = 720;
+
+ // TVM constants
+ private static final int OUTPUT_INDEX = 0;
+ private static final int IMG_CHANNEL = 3;
+ private static final String INPUT_NAME = "data";
+
+    // Configuration values for the extraction model. Note that the graph, lib and
+    // params are not included with TVM and must be manually placed in the assets/
+    // directory by the user.
+    // Graphs and models downloaded from https://github.com/pjreddie/darknet/blob/ may
+    // be converted, e.g., via define_and_compile_model.py.
+ private static final boolean EXE_GPU = false;
+ private static final int MODEL_INPUT_SIZE = 224;
+ private static final String MODEL_CL_LIB_FILE = "file:///android_asset/deploy_lib_opencl.so";
+ private static final String MODEL_CPU_LIB_FILE = "file:///android_asset/deploy_lib_cpu.so";
+ private static final String MODEL_GRAPH_FILE = "file:///android_asset/deploy_graph.json";
+ private static final String MODEL_PARAM_FILE = "file:///android_asset/deploy_param.params";
+ private static final String MODEL_LABEL_FILE = "file:///android_asset/imagenet.shortnames.list";
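+    // For reference, assets/ is then expected to contain (per the constants above):
+    //   deploy_lib_cpu.so (and deploy_lib_opencl.so when EXE_GPU is true),
+    //   deploy_graph.json, deploy_param.params and imagenet.shortnames.list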
+
+ private Uri mCameraImageUri;
+ private ImageView mImageView;
+ private TextView mResultView;
+ private AssetManager assetManager;
+ private Module graphRuntimeModule;
+    private Vector<String> labels = new Vector<String>();
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.activity_main);
+ Toolbar toolbar = findViewById(R.id.toolbar);
+ setSupportActionBar(toolbar);
+ assetManager = getAssets();
+
+ mImageView = (ImageView) findViewById(R.id.imageView);
+ mResultView = (TextView) findViewById(R.id.resultTextView);
+ findViewById(R.id.btnPickImage).setOnClickListener(new View.OnClickListener() {
+ @Override
+ public void onClick(View v) {
+ showPictureDialog();
+ }
+ });
+
+ if (hasPermission()) {
+            // instantiate the TVM runtime and set up the environment in the
+            // background once the application starts
+            new LoadModelAsyncTask().execute();
+ } else {
+ requestPermission();
+ }
+ }
+
+    /*
+       Load the precompiled model into the TVM graph runtime and initialize the system.
+    */
+    private class LoadModelAsyncTask extends AsyncTask<Void, Integer, Integer> {
+ ProgressDialog dialog = new ProgressDialog(MainActivity.this);
+
+ @Override
+ protected Integer doInBackground(Void... args) {
+
+ // load synset name
+            String labelFilename = MODEL_LABEL_FILE.split("file:///android_asset/")[1];
+            Log.i(TAG, "Reading synset names from: " + labelFilename);
+            try {
+                String labelsContent = new String(getBytesFromFile(assetManager, labelFilename));
+ for (String line : labelsContent.split("\\r?\\n")) {
+ labels.add(line);
+ }
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading synset name file!" + e);
+ return -1;//failure
+ }
+
+ // load json graph
+ String modelGraph = null;
+ String graphFilename = MODEL_GRAPH_FILE.split("file:///android_asset/")[1];
+ Log.i(TAG, "Reading json graph from: " + graphFilename);
+ try {
+ modelGraph = new String(getBytesFromFile(assetManager, graphFilename));
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading json graph file!" + e);
+ return -1;//failure
+ }
+
+            // copy the TVM compiled library into the application cache folder
+ String libCacheFilePath = null;
+ String libFilename = EXE_GPU ? MODEL_CL_LIB_FILE.split("file:///android_asset/")[1] :
+ MODEL_CPU_LIB_FILE.split("file:///android_asset/")[1];
+ Log.i(TAG, "Uploading compiled function to cache folder");
+ try {
+ libCacheFilePath = getTempLibFilePath(libFilename);
+ byte[] modelLibByte = getBytesFromFile(assetManager, libFilename);
+ FileOutputStream fos = new FileOutputStream(libCacheFilePath);
+ fos.write(modelLibByte);
+ fos.close();
+ } catch (IOException e) {
+ Log.e(TAG, "Problem uploading compiled function!" + e);
+ return -1;//failure
+ }
+
+ // load parameters
+ byte[] modelParams = null;
+ String paramFilename = MODEL_PARAM_FILE.split("file:///android_asset/")[1];
+ try {
+ modelParams = getBytesFromFile(assetManager, paramFilename);
+ } catch (IOException e) {
+ Log.e(TAG, "Problem reading params file!" + e);
+ return -1;//failure
+ }
+
+ // create java tvm context
+ TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu();
+
+ // tvm module for compiled functions
+ Module modelLib = Module.load(libCacheFilePath);
+
+ // get global function module for graph runtime
+ Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create");
+ TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph)
+ .pushArg(modelLib)
+ .pushArg(tvmCtx.deviceType)
+ .pushArg(tvmCtx.deviceId)
+ .invoke();
+ graphRuntimeModule = runtimeCreFunRes.asModule();
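+            // this mirrors tvm.contrib.graph_runtime.create(graph, lib, ctx) on the
+            // Python side, expressed here through TVM's packed-function API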
+
+            // get the function from the module (load parameters)
+ Function loadParamFunc = graphRuntimeModule.getFunction("load_params");
+ loadParamFunc.pushArg(modelParams).invoke();
+
+ // release tvm local variables
+ modelLib.release();
+ loadParamFunc.release();
+ runtimeCreFun.release();
+
+ return 0;//success
+ }
+
+ @Override
+ protected void onPreExecute() {
+ dialog.setCancelable(false);
+ dialog.setMessage("Loading Model...");
+ dialog.show();
+ super.onPreExecute();
+ }
+
+ @Override
+ protected void onPostExecute(Integer status) {
+ if (dialog != null && dialog.isShowing()) {
+ dialog.dismiss();
+ }
+ if (status != 0) {
+ showDialog("Error", "Fail to initialized model, check compiled model");
+ }
+ }
+ }
+
+    /*
+       Run prediction on a decoded input bitmap using the TVM graph runtime.
+    */
+    private class ModelRunAsyncTask extends AsyncTask<Bitmap, Integer, Integer> {
+ ProgressDialog dialog = new ProgressDialog(MainActivity.this);
+
+ @Override
+ protected Integer doInBackground(Bitmap... bitmaps) {
+ if (null != graphRuntimeModule) {
+ int count = bitmaps.length;
+ for (int i = 0 ; i < count ; i++) {
+ long processingTimeMs = SystemClock.uptimeMillis();
+ Log.i(TAG, "Decode JPEG image content");
+
+ // extract the jpeg content
+ ByteArrayOutputStream stream = new ByteArrayOutputStream();
+ bitmaps[i].compress(Bitmap.CompressFormat.JPEG,100,stream);
+ byte[] byteArray = stream.toByteArray();
+ Bitmap imageBitmap = BitmapFactory.decodeByteArray(byteArray, 0, byteArray.length);
+
+                    // crop the input image at the centre to the model input size
+                    // production deployment note: resize the image to the model input
+                    // size instead of cropping, so that no image content is lost
+ Bitmap cropImageBitmap = Bitmap.createBitmap(MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, Bitmap.Config.ARGB_8888);
+ Matrix frameToCropTransform = getTransformationMatrix(imageBitmap.getWidth(), imageBitmap.getHeight(),
+ MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, 0, true);
+ Canvas canvas = new Canvas(cropImageBitmap);
+ canvas.drawBitmap(imageBitmap, frameToCropTransform, null);
+
+ // image pixel int values
+ int[] pixelValues = new int[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE];
+ // image RGB float values
+ float[] imgRgbValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL];
+ // image RGB transpose float values
+ float[] imgRgbTranValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL];
+
+ // pre-process the image data from 0-255 int to normalized float based on the
+ // provided parameters.
+ cropImageBitmap.getPixels(pixelValues, 0, MODEL_INPUT_SIZE, 0, 0, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE);
+ for (int j = 0; j < pixelValues.length; ++j) {
+ imgRgbValues[j * 3 + 0] = ((pixelValues[j] >> 16) & 0xFF)/255.0f;
+ imgRgbValues[j * 3 + 1] = ((pixelValues[j] >> 8) & 0xFF)/255.0f;
+ imgRgbValues[j * 3 + 2] = (pixelValues[j] & 0xFF)/255.0f;
+ }
+
+ // pre-process the image rgb data transpose based on the provided parameters.
+ for (int k = 0; k < IMG_CHANNEL; ++k) {
+ for (int l = 0; l < MODEL_INPUT_SIZE; ++l) {
+ for (int m = 0; m < MODEL_INPUT_SIZE; ++m) {
+ int dst_index = m + MODEL_INPUT_SIZE*l + MODEL_INPUT_SIZE*MODEL_INPUT_SIZE*k;
+ int src_index = k + IMG_CHANNEL*m + IMG_CHANNEL*MODEL_INPUT_SIZE*l;
+ imgRgbTranValues[dst_index] = imgRgbValues[src_index];
+ }
+ }
+ }
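+                    // i.e. convert the bitmap's interleaved HWC layout (RGBRGB...) into
+                    // the CHW layout the model expects: dst[c][h][w] = src[h][w][c]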
+
+                    // get the function from the module (set input data)
+                    Log.i(TAG, "set input data");
+                    NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32"));
+ inputNdArray.copyFrom(imgRgbTranValues);
+ Function setInputFunc = graphRuntimeModule.getFunction("set_input");
+ setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke();
+ // release tvm local variables
+ inputNdArray.release();
+ setInputFunc.release();
+
+                    // get the function from the module (run it)
+ Log.i(TAG, "run function on target");
+ Function runFunc = graphRuntimeModule.getFunction("run");
+ runFunc.invoke();
+ // release tvm local variables
+ runFunc.release();
+
+                    // get the function from the module (get output data)
+ Log.i(TAG, "get output data");
+ NDArray outputNdArray = NDArray.empty(new long[]{1000}, new TVMType("float32"));
+ Function getOutputFunc = graphRuntimeModule.getFunction("get_output");
+ getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke();
+ float[] output = outputNdArray.asFloatArray();
+ // release tvm local variables
+ outputNdArray.release();
+ getOutputFunc.release();
+
+ // display the result from extracted output data
+ if (null != output) {
+ int maxPosition = -1;
+ float maxValue = 0;
+ for (int j = 0; j < output.length; ++j) {
+ if (output[j] > maxValue) {
+ maxValue = output[j];
+ maxPosition = j;
+ }
+ }
+ processingTimeMs = SystemClock.uptimeMillis() - processingTimeMs;
+                        String label = "Prediction Result : ";
+                        label += (maxPosition >= 0 && labels.size() > maxPosition)
+                                ? labels.get(maxPosition) : "unknown";
+ label += "\nPrediction Time : " + processingTimeMs + "ms";
+ mResultView.setText(label);
+ }
+ Log.i(TAG, "prediction finished");
+ }
+ return 0;
+ }
+ return -1;
+ }
+
+ @Override
+ protected void onPreExecute() {
+ dialog.setCancelable(false);
+ dialog.setMessage("Prediction running on image...");
+ dialog.show();
+ super.onPreExecute();
+ }
+
+ @Override
+ protected void onPostExecute(Integer status) {
+ if (dialog != null && dialog.isShowing()) {
+ dialog.dismiss();
+ }
+ if (status != 0) {
+ showDialog("Error", "Fail to predict image, GraphRuntime exception");
+ }
+ }
+ }
+
+ @Override
+ protected void onDestroy() {
+ // release tvm local variables
+ if (null != graphRuntimeModule)
+ graphRuntimeModule.release();
+ super.onDestroy();
+ }
+
+ /**
+ * Read file from assets and return byte array.
+ *
+ * @param assets The asset manager to be used to load assets.
+     * @param fileName The path of the file to read.
+     * @return byte[] file content
+     * @throws IOException if the file cannot be fully read.
+ */
+ private byte[] getBytesFromFile(AssetManager assets, String fileName) throws IOException {
+ InputStream is = assets.open(fileName);
+ int length = is.available();
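+        // NOTE: for asset streams, available() reports the full remaining asset size,
+        // so the whole file fits in one buffer; the loop below still guards against
+        // short reads.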
+ byte[] bytes = new byte[length];
+ // Read in the bytes
+ int offset = 0;
+ int numRead = 0;
+ try {
+ while (offset < bytes.length
+ && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
+ offset += numRead;
+ }
+ } finally {
+ is.close();
+ }
+ // Ensure all the bytes have been read in
+ if (offset < bytes.length) {
+ throw new IOException("Could not completely read file " + fileName);
+ }
+ return bytes;
+ }
+
+ /**
+     * Show a dialog to pick an image from the gallery or capture one with the camera.
+ */
+ private void showPictureDialog(){
+ AlertDialog.Builder pictureDialog = new AlertDialog.Builder(this);
+ pictureDialog.setTitle("Select Action");
+ String[] pictureDialogItems = {
+ "Select photo from gallery",
+ "Capture photo from camera" };
+ pictureDialog.setItems(pictureDialogItems,
+ new DialogInterface.OnClickListener() {
+ @Override
+ public void onClick(DialogInterface dialog, int which) {
+ switch (which) {
+ case 0:
+ choosePhotoFromGallery();
+ break;
+ case 1:
+ takePhotoFromCamera();
+ break;
+ }
+ }
+ });
+ pictureDialog.show();
+ }
+
+ /**
+ * Request to pick image from Gallery.
+ */
+ public void choosePhotoFromGallery() {
+ Intent galleryIntent = new Intent(Intent.ACTION_PICK,
+ android.provider.MediaStore.Images.Media.EXTERNAL_CONTENT_URI);
+
+ startActivityForResult(galleryIntent, PICTURE_FROM_GALLERY);
+ }
+
+ /**
+ * Request to capture image from Camera.
+ */
+ private void takePhotoFromCamera() {
+ Intent intent = new Intent(android.provider.MediaStore.ACTION_IMAGE_CAPTURE);
+
+ if (Build.VERSION.SDK_INT < Build.VERSION_CODES.N) {
+ mCameraImageUri = Uri.fromFile(createImageFile());
+ } else {
+ File file = new File(createImageFile().getPath());
+ mCameraImageUri = FileProvider.getUriForFile(getApplicationContext(), getApplicationContext().getPackageName() + ".provider", file);
+ }
+
+ intent.putExtra(MediaStore.EXTRA_OUTPUT, mCameraImageUri);
+ startActivityForResult(intent, PICTURE_FROM_CAMERA);
+ }
+
+ @Override
+ public void onActivityResult(int requestCode, int resultCode, Intent data) {
+ super.onActivityResult(requestCode, resultCode, data);
+ if (resultCode == this.RESULT_CANCELED) {
+ return;
+ }
+ Uri contentURI = null;
+ if (requestCode == PICTURE_FROM_GALLERY) {
+ if (data != null) {
+ contentURI = data.getData();
+ }
+ } else if (requestCode == PICTURE_FROM_CAMERA) {
+ contentURI = mCameraImageUri;
+ }
+ if (null != contentURI) {
+ try {
+ Bitmap bitmap = MediaStore.Images.Media.getBitmap(this.getContentResolver(), contentURI);
+ Bitmap scaled = Bitmap.createScaledBitmap(bitmap, IMAGE_PREVIEW_HEIGHT, IMAGE_PREVIEW_WIDTH, true);
+ mImageView.setImageBitmap(scaled);
+ new ModelRunAsyncTask().execute(scaled);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /**
+     * Get an application cache path in which to place the compiled library.
+     *
+     * @param fileName library file name.
+     * @return String application cache folder path
+     * @throws IOException if the cache directory cannot be created.
+ */
+ private final String getTempLibFilePath(String fileName) throws IOException {
+ File tempDir = File.createTempFile("tvm4j_demo_", "");
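+        // createTempFile is used only to reserve a unique path; the empty file is
+        // replaced below by a directory of the same name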
+ if (!tempDir.delete() || !tempDir.mkdir()) {
+ throw new IOException("Couldn't create directory " + tempDir.getAbsolutePath());
+ }
+ return (tempDir + File.separator + fileName);
+ }
+
+ /**
+     * Create an image file under external storage where the camera application saves the captured image.
+ *
+ * @return File image file under sdcard where camera can save image
+ */
+ private File createImageFile() {
+ // Create an image file name
+ String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());
+ String imageFileName = "JPEG_" + timeStamp + "_";
+ File storageDir = Environment.getExternalStoragePublicDirectory(
+ Environment.DIRECTORY_PICTURES);
+ try {
+ File image = File.createTempFile(
+ imageFileName, // prefix
+ ".jpg", // suffix
+ storageDir // directory
+ );
+ return image;
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ /**
+ * Show dialog to user.
+ *
+ * @param title dialog display title
+ * @param msg dialog display message
+ */
+ private void showDialog(String title, String msg) {
+ AlertDialog.Builder builder = new AlertDialog.Builder(this);
+ builder.setTitle(title);
+ builder.setMessage(msg);
+ builder.setCancelable(true);
+ builder.setNeutralButton(android.R.string.ok,
+ new DialogInterface.OnClickListener() {
+ public void onClick(DialogInterface dialog, int id) {
+ dialog.cancel();
+ finish();
+ }
+ });
+ builder.create().show();
+ }
+
+ @Override
+    public void onRequestPermissionsResult(final int requestCode, final String[] permissions, final int[] grantResults) {
+        if (requestCode == PERMISSIONS_REQUEST) {
+            if (grantResults.length > 1
+                    && grantResults[0] == PackageManager.PERMISSION_GRANTED
+                    && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+                // instantiate the TVM runtime and set up the environment in the
+                // background once the application starts
+                new LoadModelAsyncTask().execute();
+ } else {
+ requestPermission();
+ }
+ }
+ }
+
+ /**
+ * Whether application has required mandatory permissions to run.
+ */
+ private boolean hasPermission() {
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+ return checkSelfPermission(Manifest.permission.CAMERA) == PackageManager.PERMISSION_GRANTED &&
+ checkSelfPermission(Manifest.permission.WRITE_EXTERNAL_STORAGE) == PackageManager.PERMISSION_GRANTED;
+ } else {
+ return true;
+ }
+ }
+
+ /**
+ * Request required mandatory permission for application to run.
+ */
+ private void requestPermission() {
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+ if (shouldShowRequestPermissionRationale(Manifest.permission.CAMERA) ||
+ shouldShowRequestPermissionRationale(Manifest.permission.WRITE_EXTERNAL_STORAGE)) {
+ Toast.makeText(this,
+ "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+ }
+ requestPermissions(new String[] {Manifest.permission.CAMERA, Manifest.permission.WRITE_EXTERNAL_STORAGE}, PERMISSIONS_REQUEST);
+ }
+ }
+
+ /**
+ * Returns a transformation matrix from one reference frame into another.
+ * Handles cropping (if maintaining aspect ratio is desired) and rotation.
+ *
+ * @param srcWidth Width of source frame.
+ * @param srcHeight Height of source frame.
+ * @param dstWidth Width of destination frame.
+ * @param dstHeight Height of destination frame.
+ * @param applyRotation Amount of rotation to apply from one frame to another.
+ * Must be a multiple of 90.
+ * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
+ * cropping the image if necessary.
+ * @return The transformation fulfilling the desired requirements.
+ */
+ public static Matrix getTransformationMatrix(
+ final int srcWidth,
+ final int srcHeight,
+ final int dstWidth,
+ final int dstHeight,
+ final int applyRotation,
+ final boolean maintainAspectRatio) {
+ final Matrix matrix = new Matrix();
+
+ if (applyRotation != 0) {
+ if (applyRotation % 90 != 0) {
+ Log.w(TAG, "Rotation of %d % 90 != 0 " + applyRotation);
+ }
+
+ // Translate so center of image is at origin.
+ matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
+
+ // Rotate around origin.
+ matrix.postRotate(applyRotation);
+ }
+
+ // Account for the already applied rotation, if any, and then determine how
+ // much scaling is needed for each axis.
+ final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
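+        // a rotation of 90 or 270 degrees swaps the source width and height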
+
+ final int inWidth = transpose ? srcHeight : srcWidth;
+ final int inHeight = transpose ? srcWidth : srcHeight;
+
+ // Apply scaling if necessary.
+ if (inWidth != dstWidth || inHeight != dstHeight) {
+ final float scaleFactorX = dstWidth / (float) inWidth;
+ final float scaleFactorY = dstHeight / (float) inHeight;
+
+ if (maintainAspectRatio) {
+                // Scale by the larger factor so that dst is filled completely while
+                // maintaining the aspect ratio; some of the image may fall off the edge.
+ final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
+ matrix.postScale(scaleFactor, scaleFactor);
+ } else {
+ // Scale exactly to fill dst from src.
+ matrix.postScale(scaleFactorX, scaleFactorY);
+ }
+ }
+
+ if (applyRotation != 0) {
+ // Translate back from origin centered reference to destination frame.
+ matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
+ }
+
+ return matrix;
+ }
+}
\ No newline at end of file
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
new file mode 100644
index 000000000000..a99517f90332
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -0,0 +1,42 @@
+LOCAL_PATH := $(call my-dir)
+MY_PATH := $(LOCAL_PATH)
+
+include $(CLEAR_VARS)
+
+LOCAL_PATH := $(MY_PATH)
+ROOT_PATH := $(MY_PATH)/../../../../../..
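+# (six directory levels up from app/src/main/jni to the TVM repository root)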
+
+ifndef config
+ ifneq ("$(wildcard ./config.mk)","")
+ config ?= config.mk
+ else
+ config ?= make/config.mk
+ endif
+endif
+
+include $(config)
+
+LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
+LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
+
+LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
+ $(ROOT_PATH)/dlpack/include \
+ $(ROOT_PATH)/dmlc-core/include \
+ $(ROOT_PATH)/HalideIR/src \
+ $(ROOT_PATH)/topi/include
+
+LOCAL_MODULE = tvm4j_runtime_packed
+
+LOCAL_CPP_FEATURES += exceptions
+LOCAL_LDLIBS += -latomic
+LOCAL_ARM_MODE := arm
+
+ifdef ADD_C_INCLUDES
+ LOCAL_C_INCLUDES += $(ADD_C_INCLUDES)
+endif
+
+ifdef ADD_LDLIBS
+ LOCAL_LDLIBS += $(ADD_LDLIBS)
+endif
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk
new file mode 100644
index 000000000000..8e81a8d6a81c
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Application.mk
@@ -0,0 +1,16 @@
+ifndef config
+ ifneq ("$(wildcard ./config.mk)","")
+ config ?= config.mk
+ else
+ config ?= make/config.mk
+ endif
+endif
+
+include $(config)
+
+APP_STL := c++_static
+
+APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
+ifeq ($(USE_OPENCL), 1)
+ APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+endif
diff --git a/apps/android_deploy/app/src/main/jni/build.sh b/apps/android_deploy/app/src/main/jni/build.sh
new file mode 100644
index 000000000000..1ca38ae5bd12
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+PATH="$PATH:/usr/local/bin"
+CURR_DIR=$(cd `dirname $0`; pwd)
+ROOT_DIR="$CURR_DIR/../../../../../.."
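+# generate the JNI header for ml.dmlc.tvm.LibInfo from the prebuilt tvm4j classes,
+# then copy the JNI sources from the jvm/ tree so ndk-build can compile them here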
+javah -o $CURR_DIR/ml_dmlc_tvm_native_c_api.h -cp "$ROOT_DIR/jvm/core/target/*" ml.dmlc.tvm.LibInfo || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc $CURR_DIR/ || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/jni_helper_func.h $CURR_DIR/ || exit -1
+rm -rf $CURR_DIR/../libs
+ndk-build --directory=$CURR_DIR
diff --git a/apps/android_deploy/app/src/main/jni/make/config.mk b/apps/android_deploy/app/src/main/jni/make/config.mk
new file mode 100644
index 000000000000..8d6f5a56dd5b
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/make/config.mk
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------------------------
+# Template configuration for compiling
+#
+# If you want to change the configuration, please use the following
+# steps. Assume you are in the root directory. First copy this
+# file so that any local changes will be ignored by git
+#
+# cp make/config.mk .
+#
+# Next, modify the relevant entries, and then compile by
+#
+# ./build.sh
+#
+#-------------------------------------------------------------------------------
+APP_ABI = all
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 0
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES =
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS =
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
new file mode 100644
index 000000000000..0b5f4ee67237
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -0,0 +1,27 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file tvm_runtime.h
+ * \brief Pack all tvm runtime source files
+ */
+#include <sys/stat.h>
+#include <fstream>
+
+#include "../src/runtime/c_runtime_api.cc"
+#include "../src/runtime/cpu_device_api.cc"
+#include "../src/runtime/workspace_pool.cc"
+#include "../src/runtime/module_util.cc"
+#include "../src/runtime/system_lib_module.cc"
+#include "../src/runtime/module.cc"
+#include "../src/runtime/registry.cc"
+#include "../src/runtime/file_util.cc"
+#include "../src/runtime/dso_module.cc"
+#include "../src/runtime/thread_pool.cc"
+#include "../src/runtime/threading_backend.cc"
+#include "../src/runtime/ndarray.cc"
+
+#include "../src/runtime/graph/graph_runtime.cc"
+
+#ifdef TVM_OPENCL_RUNTIME
+#include "../src/runtime/opencl/opencl_device_api.cc"
+#include "../src/runtime/opencl/opencl_module.cc"
+#endif
diff --git a/apps/android_deploy/app/src/main/res/layout/activity_main.xml b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
new file mode 100644
index 000000000000..b16a5c2548a6
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
@@ -0,0 +1,27 @@
+<!-- (layout markup stripped from this excerpt: declares the app toolbar,
+     @id/toolbar, and includes the content_main layout used by MainActivity) -->
diff --git a/apps/android_deploy/app/src/main/res/layout/content_main.xml b/apps/android_deploy/app/src/main/res/layout/content_main.xml
new file mode 100644
index 000000000000..34de93843645
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/content_main.xml
@@ -0,0 +1,46 @@
+<!-- (layout markup stripped from this excerpt: the main screen widgets referenced
+     by MainActivity, namely the image preview @id/imageView, the result text
+     @id/resultTextView, and the pick-image button @id/btnPickImage) -->
diff --git a/apps/android_deploy/app/src/main/res/values/colors.xml b/apps/android_deploy/app/src/main/res/values/colors.xml
new file mode 100644
index 000000000000..3bdabdf11d00
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/colors.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <!-- color names reconstructed; the tag markup was stripped from this excerpt -->
+    <color name="colorPrimary">#3F51B5</color>
+    <color name="colorPrimaryDark">#303F9F</color>
+    <color name="colorAccent">#06d467</color>
+</resources>
diff --git a/apps/android_deploy/app/src/main/res/values/strings.xml b/apps/android_deploy/app/src/main/res/values/strings.xml
new file mode 100644
index 000000000000..cf1fa24069a1
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">TVM Android Demo</string>
+</resources>
\ No newline at end of file
diff --git a/apps/android_deploy/app/src/main/res/values/styles.xml b/apps/android_deploy/app/src/main/res/values/styles.xml
new file mode 100644
index 000000000000..44f664f202f9
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/values/styles.xml
@@ -0,0 +1,17 @@
+<!-- (style markup stripped from this excerpt: defines the AppTheme styles applied
+     by the manifest and layouts) -->
diff --git a/apps/android_deploy/app/src/main/res/xml/provider_paths.xml b/apps/android_deploy/app/src/main/res/xml/provider_paths.xml
new file mode 100644
index 000000000000..74a5cde1d8fd
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/xml/provider_paths.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<paths xmlns:android="http://schemas.android.com/apk/res/android">
+    <!-- reconstructed standard FileProvider path declaration; the tag markup was stripped from this excerpt -->
+    <external-path name="external_files" path="." />
+</paths>
\ No newline at end of file
diff --git a/apps/android_deploy/build.gradle b/apps/android_deploy/build.gradle
new file mode 100644
index 000000000000..f7bbe2641c9d
--- /dev/null
+++ b/apps/android_deploy/build.gradle
@@ -0,0 +1,29 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+ repositories {
+ jcenter()
+ }
+ dependencies {
+ classpath 'com.android.tools.build:gradle:2.3.3'
+ classpath 'org.apache.httpcomponents:httpclient:4.5.4'
+
+ // NOTE: Do not place your application dependencies here; they belong
+ // in the individual module build.gradle files
+ }
+}
+
+allprojects {
+ repositories {
+ jcenter()
+ maven {
+ url 'https://maven.google.com'
+ }
+ mavenLocal()
+ mavenCentral()
+ }
+}
+
+task clean(type: Delete) {
+ delete rootProject.buildDir
+}
diff --git a/apps/android_deploy/dev_tools/gen_keystore.sh b/apps/android_deploy/dev_tools/gen_keystore.sh
new file mode 100644
index 000000000000..e91cd05ad957
--- /dev/null
+++ b/apps/android_deploy/dev_tools/gen_keystore.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+CURR_DIR=$(cd `dirname $0`; pwd)
+keytool -genkey -keystore $CURR_DIR/tvmdemo.keystore -alias tvmdemo -keyalg RSA -validity 10000
diff --git a/apps/android_deploy/dev_tools/sign_apk.sh b/apps/android_deploy/dev_tools/sign_apk.sh
new file mode 100644
index 000000000000..314f82cdb76c
--- /dev/null
+++ b/apps/android_deploy/dev_tools/sign_apk.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+CURR_DIR=$(cd `dirname $0`; pwd)
+APK_DIR=$CURR_DIR/../app/build/outputs/apk
+UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
+SIGNED_APK=$APK_DIR/tvmdemo-release.apk
+jarsigner -verbose -keystore $CURR_DIR/tvmdemo.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmdemo'
+echo $SIGNED_APK
diff --git a/apps/android_deploy/settings.gradle b/apps/android_deploy/settings.gradle
new file mode 100644
index 000000000000..e7b4def49cb5
--- /dev/null
+++ b/apps/android_deploy/settings.gradle
@@ -0,0 +1 @@
+include ':app'
diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 062227b3e424..64e7779f150a 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -1,6 +1,6 @@
# Android TVM RPC
-This folder contains Android RPC app that allows us to launch an rpc server on a Android device and connect to it through python script and do testing on the python side as normal TVM RPC.
+This folder contains the Android RPC app that allows us to launch an RPC server on an Android device, connect to it through a Python script, and run tests from the Python side as with normal TVM RPC.
You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
@@ -8,6 +8,8 @@ You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Andro
### Build APK
+We use [Gradle](https://gradle.org) to build. Please follow [the installation instructions](https://gradle.org/install) for your operating system.
+
Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
```
@@ -24,17 +26,17 @@ dependencies {
}
```
-The Gradle build script is provided in the app root folder. It downloads the proper version of Gradle, compiles JNI, resolves Java dependencies and builds the Android application together with tvm4j. Run following script to build apk file.
+Now use Gradle to compile the JNI code, resolve Java dependencies, and build the Android application together with tvm4j. Run the following commands to generate the APK file.
```bash
export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
cd apps/android_rpc
-./gradlew clean build
+gradle clean build
```
In `app/build/outputs/apk` you'll find `app-release-unsigned.apk`, use `dev_tools/gen_keystore.sh` to generate a signature and use `dev_tools/sign_apk.sh` to get the signed apk file `app/build/outputs/apk/tvmrpc-release.apk`.
-Now upload `tvmrpc-release.apk` to your Android device and install it.
+Upload `tvmrpc-release.apk` to your Android device and install it.
### Build with OpenCL
@@ -49,15 +51,15 @@ Here's a piece of example for `config.mk`.
```makefile
APP_ABI = arm64-v8a
-
+
APP_PLATFORM = android-17
-
+
# whether enable OpenCL during compile
USE_OPENCL = 1
-
+
# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
-
+
# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
ADD_LDLIBS = libOpenCL.so
```
@@ -83,19 +85,22 @@ If everything goes well, you will find compile tools in `/opt/android-toolchain-
### Cross Compile and Upload to the Android Device
-First start a proxy server using `python -m tvm.exec.rpc_proxy` and make your Android device connect to this proxy server via TVM RPC application.
+First start an RPC tracker using `python -m tvm.exec.rpc_tracker --port [PORT]` and connect your Android device to this RPC tracker via the TVM RPC application.
+Set the `Address` and `Port` fields to the address and port of the RPC tracker respectively.
+The key should be set to "android" if you wish to avoid modifying the default test script.
Then checkout [android\_rpc/tests/android\_rpc\_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py) and run,
```bash
-# Specify the proxy host
-export TVM_ANDROID_RPC_PROXY_HOST=0.0.0.0
+# Specify the RPC tracker
+export TVM_TRACKER_HOST=0.0.0.0
+export TVM_TRACKER_PORT=[PORT]
# Specify the standalone Android C++ compiler
export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
python android_rpc_test.py
```
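+
+As a sanity check, you can also request a session from the tracker by hand before
+running the test script. A minimal sketch (the host, port and key must match the
+values configured above; 9190 is just a placeholder for [PORT]):
+
+```python
+from tvm import rpc
+
+tracker = rpc.connect_tracker("0.0.0.0", 9190)   # TVM_TRACKER_HOST / TVM_TRACKER_PORT
+remote = tracker.request("android", priority=0, session_timeout=60)
+print(remote.cpu(0))  # a TVMContext handle on the Android device
+```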
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector additon on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives the following results.
```bash
TVM: Initializing cython mode...
diff --git a/apps/android_rpc/app/build.gradle b/apps/android_rpc/app/build.gradle
index 97364da5cd87..a91455fc5477 100644
--- a/apps/android_rpc/app/build.gradle
+++ b/apps/android_rpc/app/build.gradle
@@ -13,7 +13,7 @@ android {
buildToolsVersion "26.0.1"
defaultConfig {
applicationId "ml.dmlc.tvm.tvmrpc"
- minSdkVersion 17
+ minSdkVersion 24
targetSdkVersion 26
versionCode 1
versionName "1.0"
diff --git a/apps/android_rpc/app/src/main/AndroidManifest.xml b/apps/android_rpc/app/src/main/AndroidManifest.xml
index 6b0d6d995dba..2dbc06ece6e3 100644
--- a/apps/android_rpc/app/src/main/AndroidManifest.xml
+++ b/apps/android_rpc/app/src/main/AndroidManifest.xml
@@ -2,11 +2,14 @@
+        android:theme="@style/AppTheme"
+        android:icon="@mipmap/ic_launcher" >
+        <!-- (remaining manifest markup stripped from this excerpt; the change
+             registers the new RPCActivity alongside the existing MainActivity) -->
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
index 62c877e8b34c..d80008bbe258 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
@@ -19,34 +19,30 @@
import android.annotation.SuppressLint;
import android.app.AlertDialog;
+import android.content.Context;
import android.content.DialogInterface;
+import android.content.SharedPreferences;
import android.os.Bundle;
import android.os.Handler;
import android.os.Message;
+
import android.support.v7.app.AppCompatActivity;
import android.support.v7.widget.Toolbar;
import android.widget.CompoundButton;
import android.widget.EditText;
import android.widget.Switch;
+import android.widget.Button;
+import android.view.View;
+import android.content.Intent;
+import android.app.NotificationChannel;
+import android.app.NotificationManager;
+
public class MainActivity extends AppCompatActivity {
- static final int MSG_RPC_ERROR = 0;
- static final String MSG_RPC_ERROR_DATA_KEY = "msg_rpc_error_data_key";
-
- private RPCProcessor tvmServerWorker;
- @SuppressLint("HandlerLeak")
- private final Handler rpcHandler = new Handler() {
- @Override
- public void dispatchMessage(Message msg) {
- Switch switchConnect = findViewById(R.id.switch_connect);
- if (msg.what == MSG_RPC_ERROR && switchConnect.isChecked()) {
- // switch off and show alert dialog.
- switchConnect.setChecked(false);
- String msgBody = msg.getData().getString(MSG_RPC_ERROR_DATA_KEY);
- showDialog("Error", msgBody);
- }
- }
- };
+ private boolean skipRelaunch = true;
+ // wait time before automatic restart of RPC Activity
+ public static final int HANDLER_RESTART_DELAY = 5000;
+
private void showDialog(String title, String msg) {
AlertDialog.Builder builder = new AlertDialog.Builder(this);
@@ -62,61 +58,124 @@ public void onClick(DialogInterface dialog, int id) {
builder.create().show();
}
+ public Intent updateRPCPrefs() {
+ System.err.println("updating preferences...");
+ EditText edProxyAddress = findViewById(R.id.input_address);
+ EditText edProxyPort = findViewById(R.id.input_port);
+ EditText edAppKey = findViewById(R.id.input_key);
+ Switch inputSwitch = findViewById(R.id.switch_persistent);
+
+ final String proxyHost = edProxyAddress.getText().toString();
+ final int proxyPort = Integer.parseInt(edProxyPort.getText().toString());
+ final String key = edAppKey.getText().toString();
+ final boolean isChecked = inputSwitch.isChecked();
+
+ SharedPreferences pref = getApplicationContext().getSharedPreferences("RPCProxyPreference", Context.MODE_PRIVATE);
+ SharedPreferences.Editor editor = pref.edit();
+ editor.putString("input_address", proxyHost);
+ editor.putString("input_port", edProxyPort.getText().toString());
+ editor.putString("input_key", key);
+ editor.putBoolean("input_switch", isChecked);
+ editor.commit();
+
+ Intent intent = new Intent(this, RPCActivity.class);
+ intent.putExtra("host", proxyHost);
+ intent.putExtra("port", proxyPort);
+ intent.putExtra("key", key);
+ return intent;
+ }
+
+ private void setupRelaunch() {
+ final Context context = this;
+ final Switch switchPersistent = findViewById(R.id.switch_persistent);
+ final Runnable rPCStarter = new Runnable() {
+ public void run() {
+ if (switchPersistent.isChecked()) {
+ System.err.println("relaunching RPC activity in 5s...");
+ Intent intent = ((MainActivity) context).updateRPCPrefs();
+ startActivity(intent);
+ }
+ }
+ };
+ Handler handler = new Handler();
+ handler.postDelayed(rPCStarter, HANDLER_RESTART_DELAY);
+ }
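+
+    // Together with onResume() below, this implements the "Keep RPC Alive" switch:
+    // whenever RPCActivity exits and MainActivity resumes, the RPC screen is
+    // relaunched after HANDLER_RESTART_DELAY ms while the switch stays checked.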
+
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
Toolbar toolbar = findViewById(R.id.toolbar);
setSupportActionBar(toolbar);
+ final Context context = this;
- tvmServerWorker = new RPCProcessor(rpcHandler);
- tvmServerWorker.setDaemon(true);
- tvmServerWorker.start();
-
- Switch switchConnect = findViewById(R.id.switch_connect);
- switchConnect.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+ Switch switchPersistent = findViewById(R.id.switch_persistent);
+ switchPersistent.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
@Override
public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
if (isChecked) {
- enableInputView(false);
- connectProxy();
+ System.err.println("automatic RPC restart enabled...");
+ updateRPCPrefs();
} else {
- disconnect();
- enableInputView(true);
+ System.err.println("automatic RPC restart disabled...");
+ updateRPCPrefs();
}
}
});
- }
- @Override
- protected void onDestroy() {
- super.onDestroy();
- tvmServerWorker.disconnect();
- }
-
- private void connectProxy() {
- EditText edProxyAddress = findViewById(R.id.input_address);
- EditText edProxyPort = findViewById(R.id.input_port);
- EditText edAppKey = findViewById(R.id.input_key);
+ Button startRPC = findViewById(R.id.button_start_rpc);
+ startRPC.setOnClickListener(new View.OnClickListener() {
+ public void onClick(View v) {
+ Intent intent = ((MainActivity) context).updateRPCPrefs();
+ startActivity(intent);
+ }
+ });
- final String proxyHost = edProxyAddress.getText().toString();
- final int proxyPort = Integer.parseInt(edProxyPort.getText().toString());
- final String key = edAppKey.getText().toString();
+ enableInputView(true);
+ }
- tvmServerWorker.connect(proxyHost, proxyPort, key);
+ @Override
+ protected void onResume() {
+ System.err.println("MainActivity onResume...");
+ System.err.println("skipRelaunch: " + skipRelaunch);
+ // if this is the first time onResume is called, do nothing, otherwise we
+ // may double launch
+ if (!skipRelaunch) {
+ enableInputView(true);
+ setupRelaunch();
+ } else {
+ skipRelaunch = false;
+ }
+ super.onResume();
}
- private void disconnect() {
- tvmServerWorker.disconnect();
- System.err.println("Disconnected.");
+ @Override
+ protected void onDestroy() {
+ super.onDestroy();
}
private void enableInputView(boolean enable) {
EditText edProxyAddress = findViewById(R.id.input_address);
EditText edProxyPort = findViewById(R.id.input_port);
EditText edAppKey = findViewById(R.id.input_key);
+ Switch input_switch = findViewById(R.id.switch_persistent);
edProxyAddress.setEnabled(enable);
edProxyPort.setEnabled(enable);
edAppKey.setEnabled(enable);
+
+ if (enable) {
+ SharedPreferences pref = getApplicationContext().getSharedPreferences("RPCProxyPreference", Context.MODE_PRIVATE);
+ String inputAddress = pref.getString("input_address", null);
+ if (null != inputAddress)
+ edProxyAddress.setText(inputAddress);
+ String inputPort = pref.getString("input_port", null);
+ if (null != inputPort)
+ edProxyPort.setText(inputPort);
+ String inputKey = pref.getString("input_key", null);
+ if (null != inputKey)
+ edAppKey.setText(inputKey);
+ boolean isChecked = pref.getBoolean("input_switch", false);
+ input_switch.setChecked(isChecked);
+ }
}
}
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java
new file mode 100644
index 000000000000..912a7c9e69a6
--- /dev/null
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCActivity.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ml.dmlc.tvm.tvmrpc;
+
+import android.os.Bundle;
+import android.support.v7.app.AppCompatActivity;
+import android.content.Intent;
+import android.widget.Button;
+import android.view.View;
+
+public class RPCActivity extends AppCompatActivity {
+ private RPCProcessor tvmServerWorker;
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.activity_rpc);
+
+ Button stopRPC = findViewById(R.id.button_stop_rpc);
+ stopRPC.setOnClickListener(new View.OnClickListener() {
+ public void onClick(View v) {
+                System.err.println("tvmServerWorker is null: " + (tvmServerWorker == null));
+ if (tvmServerWorker != null) {
+ // currently will raise a socket closed exception
+ tvmServerWorker.disconnect();
+ }
+ finish();
+ // prevent Android from recycling the process
+ System.exit(0);
+ }
+ });
+
+ System.err.println("rpc activity onCreate...");
+ Intent intent = getIntent();
+ String host = intent.getStringExtra("host");
+ int port = intent.getIntExtra("port", 9090);
+ String key = intent.getStringExtra("key");
+
+ tvmServerWorker = new RPCProcessor();
+ tvmServerWorker.setDaemon(true);
+ tvmServerWorker.start();
+ tvmServerWorker.connect(host, port, key);
+ }
+
+ @Override
+ protected void onDestroy() {
+ System.err.println("rpc activity onDestroy");
+ tvmServerWorker.disconnect();
+ super.onDestroy();
+ }
+}
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
index 2ff7fee8a6b3..6da890931104 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/RPCProcessor.java
@@ -17,15 +17,11 @@
package ml.dmlc.tvm.tvmrpc;
-import android.os.Bundle;
-import android.os.Handler;
-import android.os.Message;
import android.os.ParcelFileDescriptor;
-
import java.net.Socket;
-
-import ml.dmlc.tvm.rpc.ConnectProxyServerProcessor;
+import ml.dmlc.tvm.rpc.ConnectTrackerServerProcessor;
import ml.dmlc.tvm.rpc.SocketFileDescriptorGetter;
+import ml.dmlc.tvm.rpc.RPCWatchdog;
/**
* Connect to RPC proxy and deal with requests.
@@ -34,10 +30,10 @@ class RPCProcessor extends Thread {
private String host;
private int port;
private String key;
-
private boolean running = false;
- private ConnectProxyServerProcessor currProcessor;
- private final Handler uiHandler;
+ private long startTime;
+ private ConnectTrackerServerProcessor currProcessor;
+ private boolean first = true;
static final SocketFileDescriptorGetter socketFdGetter
= new SocketFileDescriptorGetter() {
@@ -47,11 +43,9 @@ public int get(Socket socket) {
}
};
- RPCProcessor(Handler uiHandler) {
- this.uiHandler = uiHandler;
- }
-
@Override public void run() {
+ RPCWatchdog watchdog = new RPCWatchdog();
+ watchdog.start();
while (true) {
synchronized (this) {
currProcessor = null;
@@ -61,20 +55,17 @@ public int get(Socket socket) {
} catch (InterruptedException e) {
}
}
- currProcessor = new ConnectProxyServerProcessor(host, port, key, socketFdGetter);
+ try {
+ currProcessor = new ConnectTrackerServerProcessor(host, port, key, socketFdGetter, watchdog);
+ } catch (Throwable e) {
+ e.printStackTrace();
+ // kill if creating a new processor failed
+ System.exit(0);
+ }
}
- try {
+ if (currProcessor != null)
currProcessor.run();
- } catch (Throwable e) {
- disconnect();
- // turn connect switch off.
- Message message = new Message();
- message.what = MainActivity.MSG_RPC_ERROR;
- Bundle bundle = new Bundle();
- bundle.putString(MainActivity.MSG_RPC_ERROR_DATA_KEY, e.getMessage());
- message.setData(bundle);
- uiHandler.sendMessage(message);
- }
+ watchdog.finishTimeout();
}
}
@@ -101,6 +92,6 @@ synchronized void connect(String host, int port, String key) {
this.port = port;
this.key = key;
running = true;
- notify();
+ this.notify();
}
}
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 01cad9b783a7..5bf52bdaffc0 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -8,9 +8,18 @@ endif
include $(config)
-APP_STL := gnustl_static
+# We target every architecture except armeabi here, for two reasons:
+# 1) armeabi is deprecated in NDK r16 and removed in r17
+# 2) vulkan is not supported in armeabi
+APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips
+APP_STL := c++_static
APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
-ifeq ($(USE_OPENCL), 1)
+ifeq ($(USE_OPENCL), 1)
APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
endif
+
+ifeq ($(USE_VULKAN), 1)
+ APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+ APP_LDFLAGS += -lvulkan
+endif
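+
+# USE_VULKAN, like USE_OPENCL above, is read from make/config.mk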
diff --git a/apps/android_rpc/app/src/main/jni/make/config.mk b/apps/android_rpc/app/src/main/jni/make/config.mk
index 8d6f5a56dd5b..c40ce4ba3ec7 100644
--- a/apps/android_rpc/app/src/main/jni/make/config.mk
+++ b/apps/android_rpc/app/src/main/jni/make/config.mk
@@ -14,11 +14,19 @@
#-------------------------------------------------------------------------------
APP_ABI = all
-APP_PLATFORM = android-17
+APP_PLATFORM = android-24
# whether enable OpenCL during compile
USE_OPENCL = 0
+# whether to enable Vulkan during compile
+USE_VULKAN = 0
+
+ifeq ($(USE_VULKAN), 1)
+ # Statically linking vulkan requires API Level 24 or higher
+ APP_PLATFORM = android-24
+endif
+
# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
ADD_C_INCLUDES =
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index fc384a8fcd72..c3c33b0fde37 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -6,6 +6,18 @@
#include <sys/stat.h>
#include <fstream>
+/* Enable custom logging - this will cause TVM to pass every log message
+ * through CustomLogMessage instead of LogMessage. By enabling this, we must
+ * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log
+ * messages to Android logcat.
+ */
+#define DMLC_LOG_CUSTOMIZE 1
+
+/* Ensure that fatal errors are passed to the logger before throwing
+ * in LogMessageFatal
+ */
+#define DMLC_LOG_BEFORE_THROW 1
+
#include "../src/runtime/c_runtime_api.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/workspace_pool.cc"
@@ -21,10 +33,25 @@
#include "../src/runtime/rpc/rpc_module.cc"
#include "../src/runtime/rpc/rpc_socket_impl.cc"
#include "../src/runtime/thread_pool.cc"
-
+#include "../src/runtime/threading_backend.cc"
#include "../src/runtime/graph/graph_runtime.cc"
+#include "../src/runtime/ndarray.cc"
#ifdef TVM_OPENCL_RUNTIME
#include "../src/runtime/opencl/opencl_device_api.cc"
#include "../src/runtime/opencl/opencl_module.cc"
#endif
+
+#ifdef TVM_VULKAN_RUNTIME
+#include "../src/runtime/vulkan/vulkan_device_api.cc"
+#include "../src/runtime/vulkan/vulkan_module.cc"
+#endif
+
+
+#include <android/log.h>
+
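+// With this hook in place, runtime log messages (e.g. LOG(INFO) << "...") show up
+// in logcat under the TVM_RUNTIME tag: adb logcat -s TVM_RUNTIME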
+void dmlc::CustomLogMessage::Log(const std::string& msg) {
+ // This is called for every message logged by TVM.
+ // We pass the message to logcat.
+ __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str());
+}
diff --git a/apps/android_rpc/app/src/main/res/layout/activity_main.xml b/apps/android_rpc/app/src/main/res/layout/activity_main.xml
index f617cf2a04bb..53d48bbd60d9 100644
--- a/apps/android_rpc/app/src/main/res/layout/activity_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/activity_main.xml
@@ -24,4 +24,3 @@
-
diff --git a/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml b/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml
new file mode 100644
index 000000000000..ba3102a6033c
--- /dev/null
+++ b/apps/android_rpc/app/src/main/res/layout/activity_rpc.xml
@@ -0,0 +1,26 @@
+<!-- (layout markup stripped from this excerpt: the RPC activity screen, including
+     the toolbar and the Stop RPC button @id/button_stop_rpc, plus the included
+     content_rpc layout) -->
diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 827cdfb01b8a..0f2564833ecd 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -64,9 +64,9 @@
+ android:text="@string/label_persistent"/>
+    <!-- (additional markup stripped from this excerpt; the change adds the
+         Start RPC button, @id/button_start_rpc) -->
diff --git a/apps/android_rpc/app/src/main/res/layout/content_rpc.xml b/apps/android_rpc/app/src/main/res/layout/content_rpc.xml
new file mode 100644
index 000000000000..fb9ab2f66a9b
--- /dev/null
+++ b/apps/android_rpc/app/src/main/res/layout/content_rpc.xml
@@ -0,0 +1,14 @@
+<!-- (layout markup stripped from this excerpt: the RPC screen content referenced
+     by activity_rpc.xml) -->
diff --git a/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png b/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png
new file mode 100644
index 000000000000..32a4f0f9157f
Binary files /dev/null and b/apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png differ
diff --git a/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png b/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png
new file mode 100644
index 000000000000..8e5d4dd8331e
Binary files /dev/null and b/apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png differ
diff --git a/apps/android_rpc/app/src/main/res/values/strings.xml b/apps/android_rpc/app/src/main/res/values/strings.xml
index 468fbed8ceaa..33caa374b496 100644
--- a/apps/android_rpc/app/src/main/res/values/strings.xml
+++ b/apps/android_rpc/app/src/main/res/values/strings.xml
@@ -1,15 +1,19 @@
    <string name="app_name">TVM RPC</string>
+   <!-- string names below are reconstructed; the tag markup was stripped from this excerpt -->
+   <string name="rpc_activity">RPC</string>
-   <string name="input_address_hint">Enter the proxy server address</string>
-   <string name="input_port_hint">Enter the proxy server port</string>
+   <string name="input_address_hint">Enter the tracker server address</string>
+   <string name="input_port_hint">Enter the tracker server port</string>
    <string name="input_key_hint">Enter the app connection key</string>
    <string name="label_address">Address</string>
    <string name="label_port">Port</string>
    <string name="label_key">Key</string>
-   <string name="switch_connect">Connect to Proxy</string>
+   <string name="label_persistent">Keep RPC Alive</string>
-   <string name="switch_on">Connected</string>
-   <string name="switch_off">Disconnected</string>
+   <string name="switch_on">Enabled</string>
+   <string name="switch_off">Disabled</string>
+
+   <string name="button_start_rpc">Start RPC</string>
+   <string name="button_stop_rpc">Stop RPC</string>
diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar b/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar
deleted file mode 100644
index 13372aef5e24..000000000000
Binary files a/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar and /dev/null differ
diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties b/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties
deleted file mode 100644
index 80a1f0954c16..000000000000
--- a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties
+++ /dev/null
@@ -1,6 +0,0 @@
-#Mon Aug 14 21:31:55 CST 2017
-distributionBase=GRADLE_USER_HOME
-distributionPath=wrapper/dists
-zipStoreBase=GRADLE_USER_HOME
-zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
diff --git a/apps/android_rpc/gradlew b/apps/android_rpc/gradlew
deleted file mode 100755
index 9d82f7891513..000000000000
--- a/apps/android_rpc/gradlew
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env bash
-
-##############################################################################
-##
-## Gradle start up script for UN*X
-##
-##############################################################################
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
-
-APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
-
-# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
-
-warn ( ) {
- echo "$*"
-}
-
-die ( ) {
- echo
- echo "$*"
- echo
- exit 1
-}
-
-# OS specific support (must be 'true' or 'false').
-cygwin=false
-msys=false
-darwin=false
-case "`uname`" in
- CYGWIN* )
- cygwin=true
- ;;
- Darwin* )
- darwin=true
- ;;
- MINGW* )
- msys=true
- ;;
-esac
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
- ls=`ls -ld "$PRG"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '/.*' > /dev/null; then
- PRG="$link"
- else
- PRG=`dirname "$PRG"`"/$link"
- fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
-
-CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
-
-# Determine the Java command to use to start the JVM.
-if [ -n "$JAVA_HOME" ] ; then
- if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
- # IBM's JDK on AIX uses strange locations for the executables
- JAVACMD="$JAVA_HOME/jre/sh/java"
- else
- JAVACMD="$JAVA_HOME/bin/java"
- fi
- if [ ! -x "$JAVACMD" ] ; then
- die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
- fi
-else
- JAVACMD="java"
- which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-fi
-
-# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
- MAX_FD_LIMIT=`ulimit -H -n`
- if [ $? -eq 0 ] ; then
- if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
- MAX_FD="$MAX_FD_LIMIT"
- fi
- ulimit -n $MAX_FD
- if [ $? -ne 0 ] ; then
- warn "Could not set maximum file descriptor limit: $MAX_FD"
- fi
- else
- warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
- fi
-fi
-
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
- GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
-
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
- JAVACMD=`cygpath --unix "$JAVACMD"`
-
- # We build the pattern for arguments to be converted via cygpath
- ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
- SEP=""
- for dir in $ROOTDIRSRAW ; do
- ROOTDIRS="$ROOTDIRS$SEP$dir"
- SEP="|"
- done
- OURCYGPATTERN="(^($ROOTDIRS))"
- # Add a user-defined pattern to the cygpath arguments
- if [ "$GRADLE_CYGPATTERN" != "" ] ; then
- OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
- fi
- # Now convert the arguments - kludge to limit ourselves to /bin/sh
- i=0
- for arg in "$@" ; do
- CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
- CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
-
- if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
- else
- eval `echo args$i`="\"$arg\""
- fi
- i=$((i+1))
- done
- case $i in
- (0) set -- ;;
- (1) set -- "$args0" ;;
- (2) set -- "$args0" "$args1" ;;
- (3) set -- "$args0" "$args1" "$args2" ;;
- (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
- esac
-fi
-
-# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
-function splitJvmOpts() {
- JVM_OPTS=("$@")
-}
-eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
-JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
-
-exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/apps/android_rpc/gradlew.bat b/apps/android_rpc/gradlew.bat
deleted file mode 100644
index aec99730b4e8..000000000000
--- a/apps/android_rpc/gradlew.bat
+++ /dev/null
@@ -1,90 +0,0 @@
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto init
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:init
-@rem Get command-line arguments, handling Windowz variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-if "%@eval[2+2]" == "4" goto 4NT_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-goto execute
-
-:4NT_args
-@rem Get arguments from the 4NT Shell from JP Software
-set CMD_LINE_ARGS=%$
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index 650892ab5735..cfb04c1ca9a9 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -1,17 +1,19 @@
"""Testcode for Android RPC.
-To use it, start a rpc proxy with "python -m tvm.exec.rpc_proxy".
-And configure the proxy host field as commented.
+To use it, start an RPC tracker with "python -m tvm.exec.rpc_tracker".
+Use the tracker's address and port when configuring the RPC app.
+Use "android" as the key if you wish to avoid modifying this script.
"""
import tvm
import os
-from tvm.contrib import rpc, util, ndk, rpc_proxy
+from tvm import rpc
+from tvm.contrib import util, ndk
import numpy as np
# Set to be address of tvm proxy.
-proxy_host = os.environ["TVM_ANDROID_RPC_PROXY_HOST"]
-proxy_port = 9090
+tracker_host = os.environ["TVM_TRACKER_HOST"]
+tracker_port = int(os.environ["TVM_TRACKER_PORT"])
key = "android"
# Change target configuration.
@@ -32,7 +34,7 @@ def test_rpc_module():
# Build the dynamic lib.
# If we don't want to do metal and only use cpu, just set target to be target
f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
- path_dso1 = temp.relpath("dev_lib.so")
+ path_dso1 = temp.relpath("dev_lib2.so")
f.export_library(path_dso1, ndk.create_shared)
s = tvm.create_schedule(B.op)
@@ -44,29 +46,31 @@ def test_rpc_module():
path_dso2 = temp.relpath("cpu_lib.so")
f.export_library(path_dso2, ndk.create_shared)
- # connect to the proxy
- remote = rpc.connect(proxy_host, proxy_port, key=key)
+ tracker = rpc.connect_tracker(tracker_host, tracker_port)
+ remote = tracker.request(key, priority=0,
+ session_timeout=60)
- print('Run GPU test ...')
- ctx = remote.cl(0)
- remote.upload(path_dso1)
- f1 = remote.load_module("dev_lib.so")
+ print('Run CPU test ...')
+ ctx = remote.cpu(0)
+ remote.upload(path_dso2)
+ f2 = remote.load_module("cpu_lib.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
- time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+ time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
- print('Run CPU test ...')
- ctx = remote.cpu(0)
- remote.upload(path_dso2)
- f2 = remote.load_module("cpu_lib.so")
+
+ print('Run GPU test ...')
+ ctx = remote.cl(0)
+ remote.upload(path_dso1)
+ f1 = remote.load_module("dev_lib2.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
- time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
+ time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
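For reference, the tracker workflow that the updated test relies on looks like this in isolation. This is a minimal sketch: the host, port, and key are assumed values, and `text_summary()` is the tracker-session helper that backs `python -m tvm.exec.query_rpc_tracker`.

```python
from tvm import rpc

# Connect to a running tracker (started with "python -m tvm.exec.rpc_tracker").
tracker = rpc.connect_tracker('127.0.0.1', 9190)  # assumed host/port
# Optionally inspect which device keys are registered.
print(tracker.text_summary())
# Request a session on a device registered under the "android" key.
remote = tracker.request('android', priority=0, session_timeout=60)
print(remote.cpu(0))  # a context on the remote device
```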
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
new file mode 100644
index 000000000000..e83e47c46eb7
--- /dev/null
+++ b/apps/benchmark/README.md
@@ -0,0 +1,70 @@
+# Performance Benchmark
+
+## Results
+
+See results on wiki page https://github.com/dmlc/tvm/wiki/Benchmark
+
+## How to Reproduce
+
+### ARM CPU
+We use TVM's RPC infrastructure to make device management easy, so you need to set it up to reproduce the benchmark results.
+
+1. Start an RPC Tracker on the host machine
+```bash
+python3 -m tvm.exec.rpc_tracker
+```
+
+2. Register devices to the tracker
+* For Linux device
+ * Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
+ * Register your device to tracker by
+ ```bash
+ python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
+ ```
+ Replace `[HOST_IP]` with the IP address of the host machine and `[DEVICE_KEY]` with the name of the device.
+
+ E.g., here is an example command for RK3399:
+ `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker host.
+
+* For Android device
+ * Build and install tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
+ Make sure you can pass the Android RPC test; once it passes, you already know how to register.
+
+3. Verify the device registration
+ We can query all registered devices by
+ ```bash
+ python3 -m tvm.exec.query_rpc_tracker
+ ```
+ You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.
+
+ For our test environment, one sample output can be
+ ```bash
+ Queue Status
+ ------------------------------
+ key free pending
+ ------------------------------
+ mate10pro 1 0
+ p20pro 2 0
+ pixel2 2 0
+ rk3399 2 0
+ rasp3b 8 0
+ ```
+
+4. Run benchmark
+ We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
+ and released the pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
+ During compilation, TVM will download these operator parameters automatically.
+
+ ```bash
+ python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
+ python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
+ python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
+ python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
+ python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro
+ ```
+
+ If your device has the same SoC as one of the devices above, you can reuse these parameters
+ (e.g. use `llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu` as the target; see the sketch right after this README).
+ Otherwise, you need to tune for your own device; please follow this
+ [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
+
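Since the pre-tuned parameters above are looked up by target string, reusing them on a board with the same SoC only requires constructing the matching target. A minimal sketch, assuming the rk3399 example from step 4 and that `tvm.target.create` is available for parsing explicit target strings:

```python
import tvm

# Helper used by arm_cpu_imagenet_bench.py; it expands to the
# -device/-model/-target attributes shown above.
target = tvm.target.arm_cpu(model='rk3399')

# Equivalent explicit target string for a board with the same SoC.
target = tvm.target.create('llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu')
```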
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
new file mode 100644
index 000000000000..7baf244e0dae
--- /dev/null
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -0,0 +1,96 @@
+"""Benchmark script for performance on ARM CPU.
+See README.md for the usage and results of this script.
+"""
+
+import argparse
+import time
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+def get_network(name, batch_size):
+ """Get the symbol definition and random weight of a network"""
+ input_shape = (batch_size, 3, 224, 224)
+ output_shape = (batch_size, 1000)
+
+ if name == 'resnet-18':
+ net, params = nnvm.testing.resnet.get_workload(num_layers=18,
+ batch_size=batch_size, image_shape=(3, 224, 224))
+ elif name == 'mobilenet':
+ net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+ elif name == 'squeezenet v1.1':
+ net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
+ version='1.1')
+ elif name == 'vgg-16':
+ net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
+ else:
+ raise RuntimeError("Unsupported network: " + name)
+
+ return net, params, input_shape, output_shape
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
+ parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
+ 'pixel2', 'rasp3b', 'pynq'])
+ parser.add_argument("--host", type=str, default='localhost')
+ parser.add_argument("--port", type=int, default=9190)
+ parser.add_argument("--rpc-key", type=str, required=True)
+ parser.add_argument("--number", type=int, default=6)
+ args = parser.parse_args()
+
+ dtype = 'float32'
+
+ if args.network is None:
+ networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+ else:
+ networks = [args.network]
+
+ target = tvm.target.arm_cpu(model=args.device)
+
+ # connect to remote device
+ tracker = tvm.rpc.connect_tracker(args.host, args.port)
+ remote = tracker.request(args.rpc_key)
+
+ print("--------------------------------------------------")
+ print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+ print("--------------------------------------------------")
+ for network in networks:
+ net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+ with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+ graph, lib, params = nnvm.compiler.build(
+ net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+ tmp = tempdir()
+ if 'android' in str(target):
+ from tvm.contrib import ndk
+ filename = "%s.so" % network
+ lib.export_library(tmp.relpath(filename), ndk.create_shared)
+ else:
+ filename = "%s.tar" % network
+ lib.export_library(tmp.relpath(filename))
+
+ # upload library and params
+ ctx = remote.context(str(target), 0)
+ remote.upload(tmp.relpath(filename))
+ rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+
+ rlib = remote.load_module(filename)
+ module = runtime.create(graph, rlib, ctx)
+ data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+ module.set_input('data', data_tvm)
+ module.set_input(**rparams)
+
+ # evaluate
+ ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+ prof_res = np.array(ftimer().results) * 1000 # multiply by 1000 to convert to milliseconds
+ print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
new file mode 100644
index 000000000000..fca4e35b6516
--- /dev/null
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -0,0 +1,80 @@
+""" Benchmark script for performance on GPUs.
+
+For example, run the file with:
+`python gpu_imagenet_bench.py --model=mobilenet --target=cuda`.
+For more details about how to set up the inference environment on GPUs,
+please refer to the NNVM tutorial "ImageNet Inference on the GPU".
+"""
+import time
+import argparse
+import numpy as np
+import tvm
+import nnvm.compiler
+import nnvm.testing
+from tvm.contrib import util, nvcc
+from tvm.contrib import graph_runtime as runtime
+
+@tvm.register_func
+def tvm_callback_cuda_compile(code):
+ ptx = nvcc.compile_cuda(code, target="ptx")
+ return ptx
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ choices=['resnet', 'mobilenet'],
+ help="The model type.")
+ parser.add_argument('--target', type=str, required=True,
+ choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
+ help="Compilation target.")
+ parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
+ parser.add_argument('--num-iter', type=int, default=1000, help="Number of iterations per benchmark run.")
+ parser.add_argument('--repeat', type=int, default=1, help="Number of times to repeat the measurement.")
+ args = parser.parse_args()
+ opt_level = args.opt_level
+ num_iter = args.num_iter
+ ctx = tvm.context(args.target, 0)
+ batch_size = 1
+ num_classes = 1000
+ image_shape = (3, 224, 224)
+
+ data_shape = (batch_size,) + image_shape
+ out_shape = (batch_size, num_classes)
+ if args.model == 'resnet':
+ net, params = nnvm.testing.resnet.get_workload(
+ batch_size=1, image_shape=image_shape)
+ elif args.model == 'mobilenet':
+ net, params = nnvm.testing.mobilenet.get_workload(
+ batch_size=1, image_shape=image_shape)
+ else:
+ raise ValueError('no benchmark prepared for {}.'.format(args.model))
+
+ if args.target == "cuda":
+ unroll = 1400
+ else:
+ unroll = 128
+ with nnvm.compiler.build_config(opt_level=opt_level):
+ with tvm.build_config(auto_unroll_max_step=unroll,
+ unroll_explicit=(args.target != "cuda")):
+ graph, lib, params = nnvm.compiler.build(
+ net, args.target, shape={"data": data_shape}, params=params)
+
+ data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+ module = runtime.create(graph, lib, ctx)
+ module.set_input(**params)
+ module.set_input("data", data)
+ module.run()
+ out = module.get_output(0, tvm.nd.empty(out_shape))
+ out.asnumpy()
+
+ print('benchmark args: {}'.format(args))
+ ftimer = module.module.time_evaluator("run", ctx, num_iter)
+ for i in range(args.repeat):
+ prof_res = ftimer()
+ print(prof_res)
+ # sleep to avoid overheating the device
+ if i + 1 != args.repeat:
+ time.sleep(45)
+
+if __name__ == '__main__':
+ main()
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index 6d7f4bdf7533..bb8b4b694187 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -22,12 +22,11 @@ struct extension_class_info {
} // namespace tvm
} // namespace runtime
-
-namespace tvm_ext {
-
using namespace tvm;
using namespace tvm::runtime;
+namespace tvm_ext {
+
TVM_REGISTER_EXT_TYPE(IntVector);
TVM_REGISTER_GLOBAL("tvm_ext.ivec_create")
@@ -66,3 +65,18 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
*rv = (*tvm::runtime::Registry::Get("device_api.cpu"))();
});
} // namespace tvm_ext
+
+// This callback approach lets the extension declare functions that TVM can extract.
+// This can be helpful when we want to use a header-only,
+// minimum version of the TVM runtime.
+extern "C" int TVMExtDeclare(TVMFunctionHandle pregister) {
+ const PackedFunc& fregister =
+ *static_cast<PackedFunc*>(pregister);
+ auto mul = [](TVMArgs args, TVMRetValue *rv) {
+ int x = args[0];
+ int y = args[1];
+ *rv = x * y;
+ };
+ fregister("mul", PackedFunc(mul));
+ return 0;
+}
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index 0bbfff14eeef..628602f0baea 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -44,8 +44,14 @@ def ivec_cb(v2):
tvm.convert(ivec_cb)(ivec)
+def test_extract_ext():
+ fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare)
+ assert fdict["mul"](3, 4) == 12
+
+
if __name__ == "__main__":
test_ext_dev()
test_ext_vec()
test_bind_add()
test_sym_add()
+ test_extract_ext()
diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
index 8e59084c60ae..ad4e56680d21 100644
--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
@@ -8,7 +8,7 @@ PKG_CFLAGS = -std=c++11 -O2 -fPIC\
-I${DMLC_CORE}/include\
-I${TVM_ROOT}/dlpack/include\
-PKG_LDFLAGS = -L${TVM_ROOT}/lib -ldl -lpthread
+PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread
.PHONY: clean all
diff --git a/apps/howto_deploy/README.md b/apps/howto_deploy/README.md
index 6c732879a6a5..fda6251ae9c5 100644
--- a/apps/howto_deploy/README.md
+++ b/apps/howto_deploy/README.md
@@ -8,4 +8,4 @@ Type the following command to run the sample code under the current folder(need
./run_example.sh
```
-Checkout [How to Deploy TVM Modules](http://docs.tvmlang.org/how_to/deploy.html) for more information.
+Checkout [How to Deploy TVM Modules](http://docs.tvm.ai/deploy/cpp_deploy.html) for more information.
diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc
index e3a88550dc2b..1fd22e5f2b5f 100644
--- a/apps/howto_deploy/cpp_deploy.cc
+++ b/apps/howto_deploy/cpp_deploy.cc
@@ -28,10 +28,10 @@ void Verify(tvm::runtime::Module mod, std::string fname) {
DLTensor* x;
DLTensor* y;
int ndim = 1;
- int dtype_code = kFloat;
+ int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
- int device_type = kCPU;
+ int device_type = kDLCPU;
int device_id = 0;
int64_t shape[1] = {10};
TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes,
diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
index e5c65b66b71a..27f95e9e6065 100644
--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -25,7 +25,9 @@
#include "../../src/runtime/module.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/thread_pool.cc"
+#include "../../src/runtime/ndarray.cc"
// NOTE: all the files after this are optional modules
// that you can include remove, depending on how much feature you use.
@@ -44,10 +46,19 @@
// #include "../../src/runtime/rpc/rpc_event_impl.cc"
// #include "../../src/runtime/rpc/rpc_server_env.cc"
+// These macros enable the corresponding device APIs when the includes below are uncommented.
+#define TVM_CUDA_RUNTIME 1
+#define TVM_METAL_RUNTIME 1
+#define TVM_OPENCL_RUNTIME 1
+
// Uncomment the following lines to enable Metal
// #include "../../src/runtime/metal/metal_device_api.mm"
// #include "../../src/runtime/metal/metal_module.mm"
+// Uncomment the following lines to enable CUDA
+// #include "../../src/runtime/cuda/cuda_device_api.cc"
+// #include "../../src/runtime/cuda/cuda_module.cc"
+
// Uncomment the following lines to enable OpenCL
// #include "../../src/runtime/opencl/opencl_device_api.cc"
// #include "../../src/runtime/opencl/opencl_module.cc"
diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py
index a3df1d3a9043..c0cfcde6e294 100644
--- a/apps/ios_rpc/tests/ios_rpc_test.py
+++ b/apps/ios_rpc/tests/ios_rpc_test.py
@@ -6,7 +6,8 @@
import tvm
import os
-from tvm.contrib import rpc, util, xcode
+from tvm import rpc
+from tvm.contrib import util, xcode
import numpy as np
# Set to be address of tvm proxy.
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h
index a758f9454460..fec351e7b22b 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.h
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h
@@ -31,9 +31,12 @@ using FEventHandler = std::function<int(const std::string& in_bytes, int flag)>
 std::unique_ptr<RPCChannel> ch(new NSStreamChannel(outputStream));
- std::shared_ptr<RPCSession> sess = RPCSession::Create(std::move(ch), name);
+ std::shared_ptr<RPCSession> sess = RPCSession::Create(std::move(ch), name, remote_key);
return [sess](const std::string& in_bytes, int flag) {
return sess->ServerEventHandler(in_bytes, flag);
};
@@ -101,13 +104,13 @@ void LaunchSyncServer() {
->ServerLoop();
}
-TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.workpath")
+TVM_REGISTER_GLOBAL("tvm.rpc.server.workpath")
.set_body([](TVMArgs args, TVMRetValue* rv) {
static RPCEnv env;
*rv = env.GetPath(args[0]);
});
-TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.load_module")
+TVM_REGISTER_GLOBAL("tvm.rpc.server.load_module")
.set_body([](TVMArgs args, TVMRetValue *rv) {
std::string name = args[0];
std::string fmt = GetFileFormat(name, "");
diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm
index f25501809b01..98527bd67b50 100644
--- a/apps/ios_rpc/tvmrpc/ViewController.mm
+++ b/apps/ios_rpc/tvmrpc/ViewController.mm
@@ -143,7 +143,7 @@ - (void)open {
[outputStream_ scheduleInRunLoop:[NSRunLoop currentRunLoop] forMode:NSDefaultRunLoopMode];
[outputStream_ open];
[inputStream_ open];
- handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_);
+ handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_, "%toinit");
CHECK(handler_ != nullptr);
self.infoText.text = @"";
self.statusLabel.text = @"Connecting...";
@@ -169,7 +169,6 @@ - (IBAction)connect:(id)sender {
}
- (IBAction)disconnect:(id)sender {
-
[self close];
}
diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/pynq_rpc/start_rpc_server.sh
new file mode 100755
index 000000000000..30b3c9a90d6b
--- /dev/null
+++ b/apps/pynq_rpc/start_rpc_server.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
+
+export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
+python -m vta.exec.rpc_server
diff --git a/apps/rocm_rpc/Makefile b/apps/rocm_rpc/Makefile
new file mode 100644
index 000000000000..b4e527980941
--- /dev/null
+++ b/apps/rocm_rpc/Makefile
@@ -0,0 +1,23 @@
+# Makefile Example to deploy TVM modules.
+ROCM_PATH=/opt/rocm
+
+TVM_ROOT=$(shell cd ../..; pwd)
+NNVM_PATH=nnvm
+DMLC_CORE=${TVM_ROOT}/dmlc-core
+
+PKG_CFLAGS = -std=c++11 -O2 -fPIC\
+ -I${TVM_ROOT}/include\
+ -I${DMLC_CORE}/include\
+ -I${TVM_ROOT}/dlpack/include\
+ -I${ROCM_PATH}/include
+
+PKG_LDFLAGS = -L${ROCM_PATH}/lib -L${TVM_ROOT}/lib -ldl -lpthread -lhip_hcc -lMIOpen
+
+.PHONY: clean all
+
+all: lib/libtvm_runtime_rocm.so
+
+# Build rule for all in one TVM package library
+lib/libtvm_runtime_rocm.so: rocm_runtime_pack.cc
+ @mkdir -p $(@D)
+ $(CXX) $(PKG_CFLAGS) -shared -o $@ $(filter %.cc %.o %.a, $^) $(PKG_LDFLAGS)
diff --git a/apps/rocm_rpc/README.md b/apps/rocm_rpc/README.md
new file mode 100644
index 000000000000..70ce9780a31d
--- /dev/null
+++ b/apps/rocm_rpc/README.md
@@ -0,0 +1,41 @@
+# TVM ROCm RPC
+
+This folder contains a simple recipe to make RPC work together with ROCm. TVM's RPC server relies on a process
+fork to create a new process for each incoming session.
+Like the CUDA and OpenCL drivers, the ROCm runtime is not fork-safe.
+A typical CUDA or OpenCL driver initializes lazily,
+so we can use the normal TVM RPC server because we won't touch the driver API before we fork a new session.
+However, the current ROCm runtime initializes eagerly during startup and will directly cause an error during fork.
+This folder provides a workaround for the problem.
+
+## Usage
+- Build tvm **without** rocm (it is important to exclude rocm from the runtime)
+- Modify ROCM_PATH in the current [Makefile](Makefile) to point to your ROCm installation
+- Type make to build lib/libtvm_runtime_rocm.so, which is a standalone dll module
+- Use [start_rpc_server.sh](start_rpc_server.sh) to start the RPC server
+
+## How it works
+- The RPC server starts without ROCm dependency.
+- lib/libtvm_runtime_rocm.so is dynamically loaded only after the fork.
+
+## Note
+With ROCm RPC, we can build an AMDGPU program on a machine without an AMD GPU
+and remotely upload and execute it on an AMDGPU machine.
+Please note that you will need to set the gfx version correctly (via ```-model``` or ```-mcpu```),
+because we can no longer query the GPU version dynamically at runtime.
+
+
+```python
+import tvm
+from tvm.contrib import rpc
+
+# set mcpu explicitly to be the gpu version.
+target = "rocm -mcpu=gfx900"
+remote = rpc.connect(server_host, server_port)
+mod = tvm.build(s, args, target)
+mod.export_library("mylib.so")
+
+remote.upload("mylib.so")
+foo = remote.load_module("mylib.so")
+# same as normal RPC
+```
diff --git a/apps/rocm_rpc/rocm_runtime_pack.cc b/apps/rocm_rpc/rocm_runtime_pack.cc
new file mode 100644
index 000000000000..174d9f0a8270
--- /dev/null
+++ b/apps/rocm_rpc/rocm_runtime_pack.cc
@@ -0,0 +1,15 @@
+/*!
+ * \brief This is an all-in-one file for the ROCM runtime library.
+ *
+ * This is used to create an RPC module library that can be
+ * safely loaded after the RPC server forks a session.
+ */
+
+#define TVM_ROCM_RUNTIME 1
+#define TVM_USE_MIOPEN 1
+#define __HIP_PLATFORM_HCC__ 1
+
+#include "../../src/runtime/rocm/rocm_device_api.cc"
+#include "../../src/runtime/rocm/rocm_module.cc"
+#include "../../src/contrib/miopen/conv_forward.cc"
+#include "../../src/contrib/miopen/miopen_utils.cc"
diff --git a/apps/rocm_rpc/start_rpc_server.sh b/apps/rocm_rpc/start_rpc_server.sh
new file mode 100755
index 000000000000..e082d9d63ee6
--- /dev/null
+++ b/apps/rocm_rpc/start_rpc_server.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+PROJ_ROOT=$(realpath $(dirname "$0")/../..)
+export PYTHONPATH=${PROJ_ROOT}/python:${PYTHONPATH}
+
+python -m tvm.exec.rpc_server "$@" --load-library=${PROJ_ROOT}/apps/rocm_rpc/lib/libtvm_runtime_rocm.so
diff --git a/apps/sgx/.gitignore b/apps/sgx/.gitignore
new file mode 100644
index 000000000000..c3af857904eb
--- /dev/null
+++ b/apps/sgx/.gitignore
@@ -0,0 +1 @@
+lib/
diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
new file mode 100644
index 000000000000..cd7034d4c41b
--- /dev/null
+++ b/apps/sgx/Makefile
@@ -0,0 +1,88 @@
+# Makefile for example to deploy TVM modules in SGX.
+
+TVM_ROOT := $(shell cd ../..; pwd)
+NNVM_PATH := nnvm
+DMLC_CORE := ${TVM_ROOT}/dmlc-core
+
+SGX_SDK ?= /opt/sgxsdk
+SGX_MODE ?= SIM
+SGX_ARCH ?= x64
+SGX_DEBUG ?= 1
+
+sgx_edger8r := $(SGX_SDK)/bin/x64/sgx_edger8r
+sgx_enclave_signer := $(SGX_SDK)/bin/x64/sgx_sign
+
+ifneq ($(SGX_MODE), HW)
+ sgx_sim := _sim
+endif
+urts_library_name := sgx_urts$(sgx_sim)
+trts_library_name := sgx_trts$(sgx_sim)
+tservice_library_name := sgx_tservice$(sgx_sim)
+uservice_library_name := sgx_uae_service$(sgx_sim)
+
+pkg_cflags := -std=c++11 -O2 -fPIC\
+ -I${TVM_ROOT}/include\
+ -I${DMLC_CORE}/include\
+ -I${TVM_ROOT}/dlpack/include\
+ -I.\
+ -DDMLC_LOG_STACK_TRACE=0\
+ -fmax-errors=4
+
+pkg_ldflags := -L${TVM_ROOT}/lib
+
+enclave_include_paths := -I$(SGX_SDK)/include\
+ -I$(SGX_SDK)/include/tlibc\
+ -I$(SGX_SDK)/include/libcxx\
+ -I$(SGX_SDK)/include/stdc++\
+
+enclave_cflags := -static -nostdinc\
+ -fvisibility=hidden -fpie -fstack-protector-strong\
+ -ffunction-sections -fdata-sections\
+ -DDMLC_CXX11_THREAD_LOCAL=0\
+ -include "lib/tvm_t.h"\
+ $(enclave_include_paths)\
+
+enclave_cxxflags := -nostdinc++ $(enclave_cflags) -DTVM_SGX_MAX_CONCURRENCY=4
+
+enclave_ldflags :=\
+ -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\
+ -Wl,--whole-archive -l$(trts_library_name) -Wl,--no-whole-archive\
+ -Wl,--start-group\
+ -lsgx_tstdc -lsgx_tstdcxx -lsgx_tcxx -lsgx_tcrypto -lsgx_tkey_exchange -l$(tservice_library_name)\
+ -Wl,--end-group\
+ -Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined\
+ -Wl,-pie,-eenclave_entry -Wl,--export-dynamic\
+ -Wl,--defsym,__ImageBase=0 -Wl,--gc-sections
+
+.PHONY: clean all
+
+all: lib/test_addone.signed.so
+
+# The code library built by TVM
+lib/test_addone_sys.o: prepare_test_libs.py
+ python prepare_test_libs.py
+
+lib/tvm_t.h: ../../src/runtime/sgx/tvm.edl
+ $(sgx_edger8r) --trusted $< --trusted-dir lib --search-path $(SGX_SDK)/include
+ mv $@ $@.in
+ awk 'NR==4{print "#include <tvm/runtime/c_runtime_api.h>"}1' $@.in > $@
+
+lib/tvm_t.c: lib/tvm_t.h
+
+lib/tvm_t.o: lib/tvm_t.c
+ $(CC) $(enclave_cflags) $(pkg_cflags) -c $< -o $@ -include $(TVM_ROOT)/include/tvm/runtime/c_runtime_api.h
+
+# The enclave library
+lib/test_addone.so: $(TVM_ROOT)/src/runtime/sgx/trusted/runtime.cc lib/tvm_t.o lib/test_addone_sys.o
+ $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) $(enclave_ldflags) -g
+
+# The demo enclave signing key
+lib/enclave.pem:
+ curl -Lso $@ https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem
+
+# The signed enclave
+lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml lib/enclave.pem
+ $(sgx_enclave_signer) sign -key lib/enclave.pem -enclave $< -out $@ -config enclave_config.xml
+
+clean:
+ rm -rf lib
diff --git a/apps/sgx/README.md b/apps/sgx/README.md
new file mode 100644
index 000000000000..565519d457ce
--- /dev/null
+++ b/apps/sgx/README.md
@@ -0,0 +1,34 @@
+# TVM in Intel SGX Example
+
+This application demonstrates the use of a simple TVM model in the [Intel SGX](https://software.intel.com/en-us/blogs/2013/09/26/protecting-application-secrets-with-intel-sgx) trusted computing environment.
+
+## Prerequisites
+
+1. A GNU/Linux environment
+2. TVM compiled with LLVM and SGX, and the `tvm` Python module
+3. The [Linux SGX SDK](https://github.com/intel/linux-sgx) ([pre-built libraries](https://01.org/intel-software-guard-extensions/downloads))
+
+## Running the example
+
+`SGX_SDK=/path/to/sgxsdk bash run_example.sh`
+
+If everything goes well, you should see a lot of build messages and below them
+the text `It works!`.
+
+## High-level overview
+
+First of all, it helps to think of an SGX enclave as a library that can be called
+to perform trusted computation.
+In this library, one can use other libraries like TVM.
+
+Building this example performs the following steps:
+
+1. Creates a simple TVM module that computes `x + 1` and saves it as a system library.
+2. Builds a minimal TVM runtime pack that can load the module.
+3. Links the TVM module into an SGX enclave along with some code that runs the module.
+4. Compiles and runs an executable that loads the enclave and calls a function
+ which invokes the TVM module.
+
+For more information on building, please refer to the `Makefile`.
+For more information on the TVM module, please refer to `../howto_deploy`.
+For more information on SGX enclaves, please refer to the [SGX Enclave Demo](https://github.com/intel/linux-sgx/tree/master/SampleCode/SampleEnclave/).
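To make the overview concrete: once built, the signed enclave is loaded like any other TVM module. The following sketch condenses the bundled `test_addone.py` (included later in this change) and assumes the Makefile's `lib/` output layout:

```python
import numpy as np
import tvm

# Load the signed enclave as an ordinary TVM module and run it on CPU.
ctx = tvm.context('cpu', 0)
fadd = tvm.module.load('lib/test_addone.signed.so')

n = 10
x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
fadd(x, y)  # computes x + 1 inside the enclave
np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
```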
diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave_config.xml
new file mode 100644
index 000000000000..07be0d7a7ad2
--- /dev/null
+++ b/apps/sgx/enclave_config.xml
@@ -0,0 +1,11 @@
+<EnclaveConfiguration>
+  <ProdID>0</ProdID>
+  <ISVSVN>0</ISVSVN>
+  <StackMaxSize>0x2000</StackMaxSize>
+  <HeapMaxSize>0x2000</HeapMaxSize>
+  <TCSNum>5</TCSNum>
+  <TCSPolicy>1</TCSPolicy>
+  <DisableDebug>0</DisableDebug>
+  <MiscSelect>0</MiscSelect>
+  <MiscMask>0xFFFFFFFF</MiscMask>
+</EnclaveConfiguration>
diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py
new file mode 100644
index 000000000000..f676f46b7ff0
--- /dev/null
+++ b/apps/sgx/prepare_test_libs.py
@@ -0,0 +1,26 @@
+"""Script to prepare test_addone_sys.o"""
+
+from os import path as osp
+
+import tvm
+
+CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
+
+
+def main():
+ out_dir = osp.join(CWD, 'lib')
+
+ n = tvm.var('n')
+ A = tvm.placeholder((n,), name='A')
+ B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
+ s = tvm.create_schedule(B.op)
+ s[B].parallel(s[B].op.axis[0])
+ print(tvm.lower(s, [A, B], simple_mode=True))
+
+ # Compile library in system library mode
+ fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib')
+ fadd_syslib.save(osp.join(out_dir, 'test_addone_sys.o'))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh
new file mode 100755
index 000000000000..9334b260cbf3
--- /dev/null
+++ b/apps/sgx/run_example.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+sgx_sdk=${SGX_SDK:=/opt/sgxsdk}
+make
+echo "========================="
+LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python test_addone.py
diff --git a/apps/sgx/test_addone.py b/apps/sgx/test_addone.py
new file mode 100644
index 000000000000..5ddccfa425cc
--- /dev/null
+++ b/apps/sgx/test_addone.py
@@ -0,0 +1,13 @@
+import tvm
+import numpy as np
+
+ctx = tvm.context('cpu', 0)
+fadd1 = tvm.module.load('lib/test_addone.signed.so')
+
+n = 10
+x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
+y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
+fadd1(x, y)
+
+np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
+print("It works!")
diff --git a/cmake/config.cmake b/cmake/config.cmake
new file mode 100644
index 000000000000..85c5102169a9
--- /dev/null
+++ b/cmake/config.cmake
@@ -0,0 +1,116 @@
+#--------------------------------------------------------------------
+# Template custom cmake configuration for compiling
+#
+# This file is used to override the build options in the build directory.
+# If you want to change the configuration, please use the following
+# steps. Assume you are in the root directory. First copy this
+# file so that any local changes will be ignored by git.
+#
+# $ mkdir build
+# $ cp cmake/config.cmake build
+#
+# Next modify the corresponding entries, and then compile by
+#
+# $ cd build
+# $ cmake ..
+#
+# Then build in parallel with 8 threads
+#
+# $ make -j8
+#--------------------------------------------------------------------
+
+#---------------------------------------------
+# Backend runtimes.
+#---------------------------------------------
+
+# Whether enable CUDA during compilation
+#
+# Possible values:
+# - ON: enable CUDA with cmake's auto search
+# - OFF: disable CUDA
+# - /path/to/cuda: use specific path to cuda toolkit
+set(USE_CUDA OFF)
+
+# Whether enable ROCM runtime
+#
+# Possible values:
+# - ON: enable ROCM with cmake's auto search
+# - OFF: disable ROCM
+# - /path/to/rocm: use specific path to rocm
+set(USE_ROCM OFF)
+
+# Whether enable SDAccel runtime
+set(USE_SDACCEL OFF)
+
+# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
+set(USE_AOCL OFF)
+
+# Whether enable OpenCL runtime
+set(USE_OPENCL OFF)
+
+# Whether enable Metal runtime
+set(USE_METAL OFF)
+
+# Whether enable Vulkan runtime
+#
+# Possible values:
+# - ON: enable Vulkan with cmake's auto search
+# - OFF: disable vulkan
+# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
+set(USE_VULKAN OFF)
+
+# Whether enable OpenGL runtime
+set(USE_OPENGL OFF)
+
+# Whether enable RPC runtime
+set(USE_RPC ON)
+
+# Whether enable tiny embedded graph runtime.
+set(USE_GRAPH_RUNTIME ON)
+
+# Whether enable additional graph debug functions
+set(USE_GRAPH_RUNTIME_DEBUG OFF)
+
+# Whether build with LLVM support
+# Requires LLVM version >= 4.0
+#
+# Possible values:
+# - ON: enable llvm with cmake's find search
+# - OFF: disable llvm
+# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
+set(USE_LLVM OFF)
+
+#---------------------------------------------
+# Contrib libraries
+#---------------------------------------------
+# Whether use BLAS, choices: openblas, mkl, atlas, apple
+set(USE_BLAS none)
+
+# /path/to/mkl: mkl root path when use mkl blas library
+# set(USE_MKL_PATH /opt/intel/mkl) for UNIX
+# set(USE_MKL_PATH ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
+set(USE_MKL_PATH none)
+
+# Whether use contrib.random in runtime
+set(USE_RANDOM OFF)
+
+# Whether use NNPack
+set(USE_NNPACK OFF)
+
+# Whether use CuDNN
+set(USE_CUDNN OFF)
+
+# Whether use cuBLAS
+set(USE_CUBLAS OFF)
+
+# Whether use MIOpen
+set(USE_MIOPEN OFF)
+
+# Whether use MPS
+set(USE_MPS OFF)
+
+# Whether use rocBlas
+set(USE_ROCBLAS OFF)
+
+# Whether use contrib sort
+set(USE_SORT OFF)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
new file mode 100644
index 000000000000..70c8c8eebe28
--- /dev/null
+++ b/cmake/modules/CUDA.cmake
@@ -0,0 +1,40 @@
+# CUDA Module
+find_cuda(${USE_CUDA})
+
+if(CUDA_FOUND)
+ # always set the includedir when cuda is available
+ # avoid global retrigger of cmake
+ include_directories(${CUDA_INCLUDE_DIRS})
+endif(CUDA_FOUND)
+
+if(USE_CUDA)
+ if(NOT CUDA_FOUND)
+ message(FATAL_ERROR "Cannot find CUDA, USE_CUDA=" ${USE_CUDA})
+ endif()
+ message(STATUS "Build with CUDA support")
+ file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_cuda_on.cc)
+
+ list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
+
+ if(USE_CUDNN)
+ message(STATUS "Build with cuDNN support")
+ file(GLOB CONTRIB_CUDNN_SRCS src/contrib/cudnn/*.cc)
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CUDNN_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIBRARY})
+ endif(USE_CUDNN)
+
+ if(USE_CUBLAS)
+ message(STATUS "Build with cuBLAS support")
+ file(GLOB CONTRIB_CUBLAS_SRCS src/contrib/cublas/*.cc)
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CUBLAS_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUBLAS_LIBRARY})
+ endif(USE_CUBLAS)
+
+else(USE_CUDA)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_cuda_off.cc)
+endif(USE_CUDA)
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
new file mode 100644
index 000000000000..3e896a601056
--- /dev/null
+++ b/cmake/modules/LLVM.cmake
@@ -0,0 +1,21 @@
+# LLVM rules
+add_definitions(-DDMLC_USE_FOPEN64=0)
+
+if(NOT USE_LLVM STREQUAL "OFF")
+ find_llvm(${USE_LLVM})
+ include_directories(${LLVM_INCLUDE_DIRS})
+ add_definitions(${LLVM_DEFINITIONS})
+ message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
+ message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+ # Set flags that are only needed for LLVM target
+ add_definitions(-DTVM_LLVM_VERSION=${TVM_LLVM_VERSION})
+ file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
+ list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
+ list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
+ if(NOT MSVC)
+ set_source_files_properties(${COMPILER_LLVM_SRCS}
+ PROPERTIES COMPILE_DEFINITIONS "DMLC_ENABLE_RTTI=0")
+ set_source_files_properties(${COMPILER_LLVM_SRCS}
+ PROPERTIES COMPILE_FLAGS "-fno-rtti")
+ endif()
+endif()
diff --git a/cmake/modules/Metal.cmake b/cmake/modules/Metal.cmake
new file mode 100644
index 000000000000..27aa5a226f2b
--- /dev/null
+++ b/cmake/modules/Metal.cmake
@@ -0,0 +1,17 @@
+if(USE_METAL)
+ message(STATUS "Build with Metal support")
+ find_library(METAL_LIB Metal)
+ find_library(FOUNDATION_LIB Foundation)
+ file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${METAL_LIB} ${FOUNDATION_LIB})
+ list(APPEND RUNTIME_SRCS ${RUNTIME_METAL_SRCS})
+
+ if(USE_MPS)
+ file(GLOB MPS_CONTRIB_SRC src/contrib/mps/*.mm)
+ list(APPEND RUNTIME_SRCS ${MPS_CONTRIB_SRC})
+ find_library(MPS_CONTRIB_LIB MetalPerformanceShaders)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${MPS_CONTRIB_LIB})
+ endif()
+else(USE_METAL)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_metal_off.cc)
+endif(USE_METAL)
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
new file mode 100644
index 000000000000..b30df1864522
--- /dev/null
+++ b/cmake/modules/OpenCL.cmake
@@ -0,0 +1,42 @@
+# OPENCL Module
+find_package(OpenCL QUIET)
+
+if(OpenCL_FOUND)
+ # always set the includedir when OpenCL is available
+ # avoid global retrigger of cmake
+ include_directories(${OpenCL_INCLUDE_DIRS})
+endif(OpenCL_FOUND)
+
+if(USE_SDACCEL)
+ message(STATUS "Build with SDAccel support")
+ file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
+ if(NOT USE_OPENCL)
+ message(STATUS "Enable OpenCL support required for SDAccel")
+ set(USE_OPENCL ON)
+ endif()
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
+endif(USE_SDACCEL)
+
+if(USE_AOCL)
+ message(STATUS "Build with Intel FPGA SDK for OpenCL support")
+ file(GLOB RUNTIME_AOCL_SRCS src/runtime/opencl/aocl/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_AOCL_SRCS})
+ if(NOT USE_OPENCL)
+ message(STATUS "Enable OpenCL support required for Intel FPGA SDK for OpenCL")
+ set(USE_OPENCL ON)
+ endif()
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_aocl_off.cc)
+endif(USE_AOCL)
+
+if(USE_OPENCL)
+ find_package(OpenCL REQUIRED)
+ message(STATUS "Build with OpenCL support")
+ file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
+ list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+else()
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_opencl_off.cc)
+endif(USE_OPENCL)
diff --git a/cmake/modules/OpenGL.cmake b/cmake/modules/OpenGL.cmake
new file mode 100644
index 000000000000..2b62c9f302d4
--- /dev/null
+++ b/cmake/modules/OpenGL.cmake
@@ -0,0 +1,18 @@
+find_package(OpenGL QUIET)
+
+if(OpenGL_FOUND)
+ # always set the includedir when OpenGL is available
+ # avoid global retrigger of cmake
+ include_directories(${OPENGL_INCLUDE_DIRS})
+endif(OpenGL_FOUND)
+
+if(USE_OPENGL)
+ find_package(OpenGL REQUIRED)
+ find_package(glfw3 QUIET REQUIRED)
+ message(STATUS "Build with OpenGL support")
+ file(GLOB RUNTIME_OPENGL_SRCS src/runtime/opengl/*.cc)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenGL_LIBRARIES} glfw)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_OPENGL_SRCS})
+else(USE_OPENGL)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_opengl_off.cc)
+endif(USE_OPENGL)
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
new file mode 100644
index 000000000000..0e45fdac66d9
--- /dev/null
+++ b/cmake/modules/ROCM.cmake
@@ -0,0 +1,36 @@
+# ROCM Module
+find_rocm(${USE_ROCM})
+
+if(ROCM_FOUND)
+ # always set the includedir
+ # avoid global retrigger of cmake
+ include_directories(${ROCM_INCLUDE_DIRS})
+ add_definitions(-D__HIP_PLATFORM_HCC__=1)
+endif(ROCM_FOUND)
+
+
+if(USE_ROCM)
+ if(NOT ROCM_FOUND)
+ message(FATAL_ERROR "Cannot find ROCM, USE_ROCM=" ${USE_ROCM})
+ endif()
+ message(STATUS "Build with ROCM support")
+ file(GLOB RUNTIME_ROCM_SRCS src/runtime/rocm/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_ROCM_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_HIPHCC_LIBRARY})
+
+ if(USE_MIOPEN)
+ message(STATUS "Build with MIOpen support")
+ file(GLOB MIOPEN_CONTRIB_SRCS src/contrib/miopen/*.cc)
+ list(APPEND RUNTIME_SRCS ${MIOPEN_CONTRIB_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_MIOPEN_LIBRARY})
+ endif(USE_MIOPEN)
+
+ if(USE_ROCBLAS)
+ message(STATUS "Build with RocBLAS support")
+ file(GLOB ROCBLAS_CONTRIB_SRCS src/contrib/rocblas/*.cc)
+ list(APPEND RUNTIME_SRCS ${ROCBLAS_CONTRIB_SRCS})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_ROCBLAS_LIBRARY})
+ endif(USE_ROCBLAS)
+else(USE_ROCM)
+ list(APPEND COMPILER_SRCS src/codegen/opt/build_rocm_off.cc)
+endif(USE_ROCM)
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
new file mode 100644
index 000000000000..43fb700203c7
--- /dev/null
+++ b/cmake/modules/VTA.cmake
@@ -0,0 +1,51 @@
+# CMake Build rules for VTA
+find_program(PYTHON NAMES python python3 python3.6)
+
+if(MSVC)
+ message(STATUS "VTA build is skipped in Windows..")
+elseif(PYTHON)
+ set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py)
+
+ if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ message(STATUS "Use VTA config " ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py
+ --use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
+ endif()
+
+ execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE __vta_target)
+ string(STRIP ${__vta_target} VTA_TARGET)
+
+ message(STATUS "Build VTA runtime with target: " ${VTA_TARGET})
+
+ execute_process(COMMAND ${VTA_CONFIG} --defs OUTPUT_VARIABLE __vta_defs)
+
+ string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}")
+
+ file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc)
+ file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc)
+ list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs})
+
+ add_library(vta SHARED ${VTA_RUNTIME_SRCS})
+
+ target_include_directories(vta PUBLIC vta/include)
+
+ foreach(__def ${VTA_DEFINITIONS})
+ string(SUBSTRING ${__def} 3 -1 __strip_def)
+ target_compile_definitions(vta PUBLIC ${__strip_def})
+ endforeach()
+
+ if(APPLE)
+ set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+ endif(APPLE)
+
+ # PYNQ rules
+ if(${VTA_TARGET} STREQUAL "pynq")
+ find_library(__sds_lib NAMES sds_lib PATHS /usr/lib)
+ find_library(__dma_lib NAMES dma PATHS
+ "/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/"
+ "/opt/python3.6/lib/python3.6/site-packages/pynq/lib/")
+ target_link_libraries(vta ${__sds_lib} ${__dma_lib})
+ endif()
+else()
+ message(STATUS "Cannot found python in env, VTA build is skipped..")
+endif()
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
new file mode 100644
index 000000000000..4093f88f2e10
--- /dev/null
+++ b/cmake/modules/Vulkan.cmake
@@ -0,0 +1,22 @@
+# Be compatible with older versions of CMake
+find_vulkan(${USE_VULKAN})
+
+if(Vulkan_FOUND)
+ # always set the includedir
+ # avoid global retrigger of cmake
+ include_directories(${Vulkan_INCLUDE_DIRS})
+endif(Vulkan_FOUND)
+
+if(USE_VULKAN)
+ if(NOT Vulkan_FOUND)
+ message(FATAL_ERROR "Cannot find Vulkan, USE_VULKAN=" ${USE_VULKAN})
+ endif()
+ message(STATUS "Build with VULKAN support")
+ file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/*.cc)
+ file(GLOB COMPILER_VULKAN_SRCS src/codegen/spirv/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_VULKAN_SRCS})
+ list(APPEND COMPILER_SRCS ${COMPILER_VULKAN_SRCS})
+
+ list(APPEND TVM_LINKER_LIBS ${Vulkan_SPIRV_TOOLS_LIBRARY})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${Vulkan_LIBRARY})
+endif(USE_VULKAN)
diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake
new file mode 100644
index 000000000000..45269a20715d
--- /dev/null
+++ b/cmake/modules/contrib/BLAS.cmake
@@ -0,0 +1,34 @@
+# Plugin rules for cblas
+file(GLOB CBLAS_CONTRIB_SRC src/contrib/cblas/*.cc)
+
+if(USE_BLAS STREQUAL "openblas")
+ find_library(BLAS_LIBRARY openblas)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "mkl")
+ if(NOT IS_DIRECTORY ${USE_MKL_PATH})
+ set(USE_MKL_PATH /opt/intel/mkl)
+ endif()
+ find_library(BLAS_LIBRARY mkl_rt ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64)
+ include_directories(${USE_MKL_PATH}/include)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ add_definitions(-DUSE_MKL_BLAS=1)
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas")
+ find_library(BLAS_LIBRARY cblas)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "apple")
+ find_library(BLAS_LIBRARY Accelerate)
+ include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC})
+ message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
+elseif(USE_BLAS STREQUAL "none")
+ # pass
+else()
+ message(FATAL_ERROR "Invalid option: USE_BLAS=" ${USE_BLAS})
+endif()
diff --git a/cmake/modules/contrib/NNPack.cmake b/cmake/modules/contrib/NNPack.cmake
new file mode 100644
index 000000000000..82de88a21e63
--- /dev/null
+++ b/cmake/modules/contrib/NNPack.cmake
@@ -0,0 +1,14 @@
+if(USE_NNPACK)
+ if(NNPACK_PATH STREQUAL "")
+ set(NNPACK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/NNPack)
+ endif()
+ set(PTHREAD_POOL_PATH ${NNPACK_PATH}/deps/pthreadpool)
+ file(GLOB NNPACK_CONTRIB_SRC src/contrib/nnpack/*.cc)
+ list(APPEND RUNTIME_SRCS ${NNPACK_CONTRIB_SRC})
+ include_directories(${NNPACK_PATH}/include)
+ include_directories(${PTHREAD_POOL_PATH}/include)
+ find_library(NNPACK_CONTRIB_LIB nnpack ${NNPACK_PATH}/lib)
+ find_library(NNPACK_PTHREAD_CONTRIB_LIB pthreadpool ${NNPACK_PATH}/lib)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CONTRIB_LIB})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_PTHREAD_CONTRIB_LIB})
+endif(USE_NNPACK)
diff --git a/cmake/modules/contrib/Random.cmake b/cmake/modules/contrib/Random.cmake
new file mode 100644
index 000000000000..a6980be8bb5b
--- /dev/null
+++ b/cmake/modules/contrib/Random.cmake
@@ -0,0 +1,5 @@
+if(USE_RANDOM)
+ message(STATUS "Build with contrib.random")
+ file(GLOB RANDOM_CONTRIB_SRC src/contrib/random/random.cc)
+ list(APPEND RUNTIME_SRCS ${RANDOM_CONTRIB_SRC})
+endif(USE_RANDOM)
diff --git a/cmake/modules/contrib/Sort.cmake b/cmake/modules/contrib/Sort.cmake
new file mode 100644
index 000000000000..9ef637ecd99f
--- /dev/null
+++ b/cmake/modules/contrib/Sort.cmake
@@ -0,0 +1,5 @@
+if(USE_SORT)
+ message(STATUS "Build with contrib.sort")
+ file(GLOB SORT_CONTRIB_SRC src/contrib/sort/*.cc)
+ list(APPEND RUNTIME_SRCS ${SORT_CONTRIB_SRC})
+endif(USE_SORT)
diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
new file mode 100644
index 000000000000..3ce0cc40a5e5
--- /dev/null
+++ b/cmake/util/FindCUDA.cmake
@@ -0,0 +1,74 @@
+#######################################################
+# Enhanced version of find CUDA.
+#
+# Usage:
+# find_cuda(${USE_CUDA})
+#
+# - When USE_CUDA=ON, use auto search
+# - When USE_CUDA=/path/to/cuda-path, use the cuda path
+#
+# Provide variables:
+#
+# - CUDA_FOUND
+# - CUDA_INCLUDE_DIRS
+# - CUDA_TOOLKIT_ROOT_DIR
+# - CUDA_CUDA_LIBRARY
+# - CUDA_CUDART_LIBRARY
+# - CUDA_NVRTC_LIBRARY
+# - CUDA_CUDNN_LIBRARY
+# - CUDA_CUBLAS_LIBRARY
+#
+macro(find_cuda use_cuda)
+ set(__use_cuda ${use_cuda})
+ if(__use_cuda STREQUAL "ON")
+ find_package(CUDA QUIET)
+ elseif(IS_DIRECTORY ${__use_cuda})
+ set(CUDA_TOOLKIT_ROOT_DIR ${__use_cuda})
+ message(STATUS "Custom CUDA_PATH=" ${CUDA_TOOLKIT_ROOT_DIR})
+ set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_ROOT_DIR}/include)
+ set(CUDA_FOUND TRUE)
+ if(MSVC)
+ find_library(CUDA_CUDART_LIBRARY cudart
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ else(MSVC)
+ find_library(CUDA_CUDART_LIBRARY cudart
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ endif(MSVC)
+ endif()
+
+ # additional libraries
+ if(CUDA_FOUND)
+ if(MSVC)
+ find_library(CUDA_CUDA_LIBRARY cuda
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_NVRTC_LIBRARY nvrtc
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_CUDNN_LIBRARY cudnn
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ find_library(CUDA_CUBLAS_LIBRARY cublas
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+ else(MSVC)
+ find_library(_CUDA_CUDA_LIBRARY cuda
+ PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+ if(_CUDA_CUDA_LIBRARY)
+ set(CUDA_CUDA_LIBRARY ${_CUDA_CUDA_LIBRARY})
+ endif()
+ find_library(CUDA_NVRTC_LIBRARY nvrtc
+ PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+ find_library(CUDA_CUDNN_LIBRARY cudnn
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ find_library(CUDA_CUBLAS_LIBRARY cublas
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ endif(MSVC)
+ endif(CUDA_FOUND)
+endmacro(find_cuda)
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
new file mode 100644
index 000000000000..4bb58d462d12
--- /dev/null
+++ b/cmake/util/FindLLVM.cmake
@@ -0,0 +1,59 @@
+#######################################################
+# Enhanced version of find llvm.
+#
+# Usage:
+# find_llvm(${USE_LLVM})
+#
+# - When USE_LLVM=ON, use auto search
+# - When USE_LLVM=/path/to/llvm-config, use corresponding config
+#
+# Provide variables:
+# - LLVM_INCLUDE_DIRS
+# - LLVM_LIBS
+# - LLVM_DEFINITIONS
+# - TVM_LLVM_VERSION
+#
+macro(find_llvm use_llvm)
+ set(LLVM_CONFIG ${use_llvm})
+ if(LLVM_CONFIG STREQUAL "ON")
+ find_package(LLVM REQUIRED CONFIG)
+ llvm_map_components_to_libnames(LLVM_LIBS all)
+ list (FIND LLVM_LIBS "LLVM" _llvm_dynlib_index)
+ if (${_llvm_dynlib_index} GREATER -1)
+ set(LLVM_LIBS LLVM)
+ message(STATUS "Link with dynamic LLVM library")
+ else()
+ list(REMOVE_ITEM LLVM_LIBS LTO)
+ message(STATUS "Link with static LLVM libraries")
+ endif()
+ set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
+ elseif(NOT LLVM_CONFIG STREQUAL "OFF")
+ # use llvm config
+ message(STATUS "Use llvm-config=" ${LLVM_CONFIG})
+ execute_process(COMMAND ${LLVM_CONFIG} --libfiles
+ OUTPUT_VARIABLE __llvm_libfiles)
+ execute_process(COMMAND ${LLVM_CONFIG} --system-libs
+ OUTPUT_VARIABLE __llvm_system_libs)
+ execute_process(COMMAND ${LLVM_CONFIG} --cxxflags
+ OUTPUT_VARIABLE __llvm_cxxflags)
+ execute_process(COMMAND ${LLVM_CONFIG} --version
+ COMMAND cut -b 1,3
+ OUTPUT_VARIABLE TVM_LLVM_VERSION)
+ # definitions
+ string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" LLVM_DEFINITIONS ${__llvm_cxxflags})
+ # include dir
+ string(REGEX MATCHALL "(^| )-I[^ ]*" __llvm_include_flags ${__llvm_cxxflags})
+ set(LLVM_INCLUDE_DIRS "")
+ foreach(__flag IN ITEMS ${__llvm_include_flags})
+ string(REGEX REPLACE "(^| )-I" "" __dir "${__flag}")
+ list(APPEND LLVM_INCLUDE_DIRS "${__dir}")
+ endforeach()
+ message(STATUS ${LLVM_INCLUDE_DIRS})
+ # libfiles
+ string(STRIP ${__llvm_libfiles} __llvm_libfiles)
+ string(STRIP ${__llvm_system_libs} __llvm_system_libs)
+ set(LLVM_LIBS "${__llvm_libfiles} ${__llvm_system_libs}")
+ separate_arguments(LLVM_LIBS)
+ string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION)
+ endif()
+endmacro(find_llvm)
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
new file mode 100644
index 000000000000..235969813382
--- /dev/null
+++ b/cmake/util/FindROCM.cmake
@@ -0,0 +1,41 @@
+#######################################################
+# Enhanced version of find rocm.
+#
+# Usage:
+# find_rocm(${USE_ROCM})
+#
+# - When USE_ROCM=ON, use auto search
+# - When USE_ROCM=/path/to/rocm-path, use the rocm path
+#
+# Provide variables:
+#
+# - ROCM_FOUND
+# - ROCM_INCLUDE_DIRS
+# - ROCM_HIPHCC_LIBRARY
+# - ROCM_MIOPEN_LIBRARY
+# - ROCM_ROCBLAS_LIBRARY
+#
+
+macro(find_rocm use_rocm)
+ set(__use_rocm ${use_rocm})
+ if(IS_DIRECTORY ${__use_rocm})
+ set(__rocm_sdk ${__use_rocm})
+ message(STATUS "Custom ROCM SDK PATH=" ${__use_rocm})
+ elseif(IS_DIRECTORY $ENV{ROCM_PATH})
+ set(__rocm_sdk $ENV{ROCM_PATH})
+ elseif(IS_DIRECTORY /opt/rocm)
+ set(__rocm_sdk /opt/rocm)
+ else()
+ set(__rocm_sdk "")
+ endif()
+
+ if(__rocm_sdk)
+ set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
+ find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
+ find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
+ find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
+ if(ROCM_HIPHCC_LIBRARY)
+ set(ROCM_FOUND TRUE)
+ endif()
+ endif(__rocm_sdk)
+endmacro(find_rocm)
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
new file mode 100644
index 000000000000..0b85e8f47d79
--- /dev/null
+++ b/cmake/util/FindVulkan.cmake
@@ -0,0 +1,55 @@
+#######################################################
+# Enhanced version of find Vulkan.
+#
+# Usage:
+# find_vulkan(${USE_VULKAN})
+#
+# - When USE_VULKAN=ON, use auto search
+# - When USE_VULKAN=/path/to/vulkan-sdk-path, use the sdk
+#
+# Provide variables:
+#
+# - Vulkan_FOUND
+# - Vulkan_INCLUDE_DIRS
+# - Vulkan_LIBRARY
+# - Vulkan_SPIRV_TOOLS_LIBRARY
+#
+
+macro(find_vulkan use_vulkan)
+ set(__use_vulkan ${use_vulkan})
+ if(IS_DIRECTORY ${__use_vulkan})
+ set(__vulkan_sdk ${__use_vulkan})
+ message(STATUS "Custom Vulkan SDK PATH=" ${__use_vulkan})
+ elseif(IS_DIRECTORY $ENV{VULKAN_SDK})
+ set(__vulkan_sdk $ENV{VULKAN_SDK})
+ else()
+ set(__vulkan_sdk "")
+ endif()
+
+ if(__vulkan_sdk)
+ set(Vulkan_INCLUDE_DIRS ${__vulkan_sdk}/include)
+ find_library(Vulkan_LIBRARY NAMES vulkan vulkan-1 PATHS ${__vulkan_sdk}/lib)
+ if(Vulkan_LIBRARY)
+ set(Vulkan_FOUND TRUE)
+ endif()
+ endif(__vulkan_sdk)
+
+ # resort to find_package(Vulkan) if the option is ON
+ if(NOT Vulkan_FOUND)
+ if(__use_vulkan STREQUAL "ON")
+ find_package(Vulkan QUIET)
+ endif()
+ endif()
+ # additional libraries
+
+ if(Vulkan_FOUND)
+ get_filename_component(VULKAN_LIBRARY_PATH ${Vulkan_LIBRARY} DIRECTORY)
+ find_library(Vulkan_SPIRV_TOOLS_LIBRARY SPIRV-Tools
+ ${VULKAN_LIBRARY_PATH}/spirv-tools)
+
+ find_path(_libspirv libspirv.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv-tools)
+ find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+ find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+ list(APPEND Vulkan_INCLUDE_DIRS ${_libspirv} ${_spirv} ${_glsl_std})
+ endif(Vulkan_FOUND)
+endmacro(find_vulkan)
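+
+# A minimal usage sketch (the caller below is an assumption, not part of
+# this file):
+#
+#   find_vulkan(${USE_VULKAN})
+#   if(Vulkan_FOUND)
+#     include_directories(${Vulkan_INCLUDE_DIRS})
+#     # link against ${Vulkan_LIBRARY} and ${Vulkan_SPIRV_TOOLS_LIBRARY}
+#   endif()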
diff --git a/cmake/Util.cmake b/cmake/util/Util.cmake
similarity index 72%
rename from cmake/Util.cmake
rename to cmake/util/Util.cmake
index 0cc3acea5e55..fb3118bde9e0 100644
--- a/cmake/Util.cmake
+++ b/cmake/util/Util.cmake
@@ -1,8 +1,14 @@
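+# Internal helper of tvm_option: create a cached STRING entry only when the
+# variable is not already defined, so values set earlier (e.g. in
+# config.cmake) take precedence.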
+macro(__tvm_option variable description value)
+ if(NOT DEFINED ${variable})
+ set(${variable} ${value} CACHE STRING ${description})
+ endif()
+endmacro()
+
#######################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
# tvm_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
-function(tvm_option variable description value)
+macro(tvm_option variable description value)
set(__value ${value})
set(__condition "")
set(__varname "__value")
@@ -21,23 +27,23 @@ function(tvm_option variable description value)
if(${__condition})
if("${__value}" MATCHES ";")
if(${__value})
- option(${variable} "${description}" ON)
+ __tvm_option(${variable} "${description}" ON)
else()
- option(${variable} "${description}" OFF)
+ __tvm_option(${variable} "${description}" OFF)
endif()
elseif(DEFINED ${__value})
if(${__value})
- option(${variable} "${description}" ON)
+ __tvm_option(${variable} "${description}" ON)
else()
- option(${variable} "${description}" OFF)
+ __tvm_option(${variable} "${description}" OFF)
endif()
else()
- option(${variable} "${description}" ${__value})
+ __tvm_option(${variable} "${description}" "${__value}")
endif()
else()
unset(${variable} CACHE)
endif()
-endfunction()
+endmacro()
function(assign_source_group group)
foreach(_source IN ITEMS ${ARGN})
@@ -50,4 +56,4 @@ function(assign_source_group group)
string(REPLACE "/" "\\" _source_path_msvc "${_source_path}")
source_group("${group}\\${_source_path_msvc}" FILES "${_source}")
endforeach()
-endfunction(assign_source_group)
\ No newline at end of file
+endfunction(assign_source_group)
diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml
new file mode 100644
index 000000000000..7f18f5eea432
--- /dev/null
+++ b/conda/conda_build_config.yaml
@@ -0,0 +1,4 @@
+python:
+ - 3.5
+ - 3.6
+ - 3.7
\ No newline at end of file
diff --git a/conda/nnvm/build.sh b/conda/nnvm/build.sh
new file mode 100644
index 000000000000..9f7889e610e7
--- /dev/null
+++ b/conda/nnvm/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd nnvm/python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml
new file mode 100644
index 000000000000..a8b47d0de118
--- /dev/null
+++ b/conda/nnvm/meta.yaml
@@ -0,0 +1,39 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: nnvm
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+ skip: True # [win]
+
+requirements:
+ build:
+ - {{ compiler('cxx') }}
+ host:
+ - python {{ python }}
+ - cython
+ - numpy
+ - setuptools
+ - decorator
+ - tvm-libs =={{ version }}
+ run:
+ - tvm =={{ version }}
+ - topi =={{ version }}
+ - tvm-libs =={{ version }}
+ - python
+ - {{ pin_compatible('numpy') }}
+ - decorator
+
+test:
+ imports:
+ - nnvm
+
+about:
+ home: https://github.com/dmlc/nnvm
+ license: Apache2
+ summary: Bring deep learning to bare metal
diff --git a/conda/topi/build.sh b/conda/topi/build.sh
new file mode 100644
index 000000000000..a1f5e491c8eb
--- /dev/null
+++ b/conda/topi/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd topi/python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml
new file mode 100644
index 000000000000..af2fb4fd4228
--- /dev/null
+++ b/conda/topi/meta.yaml
@@ -0,0 +1,34 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: topi
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+
+requirements:
+ host:
+ - python {{ python }}
+ - numpy
+ - setuptools
+ - decorator
+ - tvm-libs =={{ version }}
+ run:
+ - python
+ - {{ pin_compatible('numpy') }}
+ - decorator
+ - tvm-libs =={{ version }}
+ - tvm =={{ version }}
+
+test:
+ imports:
+ - topi
+
+about:
+ home: https://github.com/dmlc/tvm
+ license: Apache2
+ summary: "TOPI: TVM Operator Inventory"
diff --git a/conda/tvm-libs/build.sh b/conda/tvm-libs/build.sh
new file mode 100644
index 000000000000..d427d922a21e
--- /dev/null
+++ b/conda/tvm-libs/build.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+if [ -z "$PREFIX" ]; then
+ PREFIX="$CONDA_PREFIX"
+fi
+
+rm -rf build || true
+mkdir -p build
+cd build
+cmake -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" ..
+make -j2 VERBOSE=1
+make install
+cd ..
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
new file mode 100644
index 000000000000..dbdfd4a7701f
--- /dev/null
+++ b/conda/tvm-libs/meta.yaml
@@ -0,0 +1,30 @@
+{% set version = "0.4.dev" %}
+
+package:
+ name: tvm-libs
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: 0
+
+requirements:
+ build:
+ - {{ compiler('cxx') }} # [linux]
+ - llvmdev ==6.0.0 # [osx]
+ host:
+ # The OS X build will require some manual setup or it will break
+ # See https://conda.io/docs/user-guide/tasks/build-packages/compiler-tools.html#macos-sdk
+ # It is also awkward because of llvm brokenness when mixed with the
+ # conda OS X compiler
+ - {{ compiler('cxx') }} # [osx]
+ - cmake
+ - llvmdev ==6.0.0 # [linux]
+ - zlib # [linux]
+
+about:
+ home: https://github.com/dmlc/tvm
+ license: Apache2
+ summary: a low level domain specific language for compiling tensor computation pipelines
\ No newline at end of file
diff --git a/conda/tvm/build.sh b/conda/tvm/build.sh
new file mode 100644
index 000000000000..9c958a32e629
--- /dev/null
+++ b/conda/tvm/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+cd python
+$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/python/conda/meta.yaml b/conda/tvm/meta.yaml
similarity index 55%
rename from python/conda/meta.yaml
rename to conda/tvm/meta.yaml
index 9ebb5afac543..478e095322eb 100644
--- a/python/conda/meta.yaml
+++ b/conda/tvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.1.dev" %}
+{% set version = "0.4.dev" %}
package:
name: tvm
@@ -8,23 +8,27 @@ source:
path: ../..
build:
- number: 1
- skip: True # [win]
- script_env:
- - CONDA_CUDA_HOME
+ number: 0
requirements:
build:
- - llvmdev ==4.0.0
- - python >=3
+ - {{ compiler('cxx') }}
+ host:
+ - python {{ python }}
+ - cython
- numpy
- setuptools
- - nose
- decorator
+ - tvm-libs =={{ version }}
run:
- - python >=3
- - numpy
+ - python
+ - {{ pin_compatible('numpy') }}
- decorator
+ - tvm-libs =={{ version }}
+
+test:
+ imports:
+ - tvm
about:
home: https://github.com/dmlc/tvm
diff --git a/dlpack b/dlpack
index 9422e98f3f4d..10892ac964f1 160000
--- a/dlpack
+++ b/dlpack
@@ -1 +1 @@
-Subproject commit 9422e98f3f4dafc6bc3473cf8484543ad376aab6
+Subproject commit 10892ac964f1af7c81aae145cd3fab78bbccd297
diff --git a/dmlc-core b/dmlc-core
index 04f91953ace7..e864aa6757cd 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 04f91953ace74aced3bb317990515304c5425849
+Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3
diff --git a/tests/ci_build/Dockerfile.cpu b/docker/Dockerfile.ci_cpu
similarity index 62%
rename from tests/ci_build/Dockerfile.cpu
rename to docker/Dockerfile.ci_cpu
index b113fc548fcb..0f0fc6f04d4c 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -1,4 +1,4 @@
-# For CPU
+# CI docker CPU env
FROM ubuntu:16.04
RUN apt-get update --fix-missing
@@ -9,11 +9,12 @@ RUN bash /install/ubuntu_install_core.sh
COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
RUN bash /install/ubuntu_install_python.sh
-COPY install/ubuntu_install_iverilog.sh /install/ubuntu_install_iverilog.sh
-RUN bash /install/ubuntu_install_iverilog.sh
-
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
+COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
+RUN bash /install/ubuntu_install_llvm.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
diff --git a/tests/ci_build/Dockerfile.emscripten b/docker/Dockerfile.ci_emscripten
similarity index 72%
rename from tests/ci_build/Dockerfile.emscripten
rename to docker/Dockerfile.ci_emscripten
index 59bf02ea7d2c..b4d5a63c52ef 100644
--- a/tests/ci_build/Dockerfile.emscripten
+++ b/docker/Dockerfile.ci_emscripten
@@ -15,4 +15,8 @@ RUN bash /install/ubuntu_install_emscripten.sh
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
-RUN cp /root/.emscripten /emsdk-portable/
\ No newline at end of file
+RUN chmod a+rwx -R /emsdk-portable
+RUN cp -r /emsdk-portable /emsdk-portable-backup
+RUN mv /emsdk-portable /emsdk-portable-x
+RUN mv /emsdk-portable-backup /emsdk-portable
+RUN cp /root/.emscripten /emsdk-portable/
diff --git a/tests/ci_build/Dockerfile.gpu b/docker/Dockerfile.ci_gpu
similarity index 56%
rename from tests/ci_build/Dockerfile.gpu
rename to docker/Dockerfile.ci_gpu
index 9dff84e84635..c177ef9d420a 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -1,3 +1,4 @@
+# CI docker GPU env
FROM nvidia/cuda:8.0-cudnn7-devel
# Base scripts
@@ -15,9 +16,6 @@ RUN bash /install/ubuntu_install_llvm.sh
COPY install/ubuntu_install_opencl.sh /install/ubuntu_install_opencl.sh
RUN bash /install/ubuntu_install_opencl.sh
-COPY install/ubuntu_install_iverilog.sh /install/ubuntu_install_iverilog.sh
-RUN bash /install/ubuntu_install_iverilog.sh
-
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
@@ -26,7 +24,10 @@ RUN bash /install/ubuntu_install_sphinx.sh
# Fix recommonmark to latest version
RUN git clone https://github.com/rtfd/recommonmark
-RUN cd recommonmark; python setup.py install
+RUN cd recommonmark; python3 setup.py install
+
+# Enable doxygen for c++ doc build
+RUN apt-get update && apt-get install -y doxygen graphviz libprotobuf-dev protobuf-compiler
COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
RUN bash /install/ubuntu_install_java.sh
@@ -37,16 +38,45 @@ RUN bash /install/ubuntu_install_nodejs.sh
COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
RUN bash /install/ubuntu_install_rocm.sh
-# Enable doxygen for c++ doc build
-RUN apt-get install -y doxygen graphviz
+COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
+RUN bash /install/ubuntu_install_opengl.sh
+
+# DL Frameworks
+COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
+RUN bash /install/ubuntu_install_mxnet.sh
+
+COPY install/ubuntu_install_coreml.sh /install/ubuntu_install_coreml.sh
+RUN bash /install/ubuntu_install_coreml.sh
+
+COPY install/ubuntu_install_keras.sh /install/ubuntu_install_keras.sh
+RUN bash /install/ubuntu_install_keras.sh
+
+COPY install/ubuntu_install_darknet.sh /install/ubuntu_install_darknet.sh
+RUN bash /install/ubuntu_install_darknet.sh
+
+COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
+RUN bash /install/ubuntu_install_onnx.sh
+
+RUN pip3 install Pillow
+
+COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
+RUN bash /install/ubuntu_install_vulkan.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
# Environment variables
-ENV PATH=/node_modules/.bin:${PATH}
ENV PATH=/usr/local/nvidia/bin:${PATH}
-ENV PATH=/usr/clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-14.04/bin:${PATH}
ENV PATH=/usr/local/cuda/bin:${PATH}
ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
+
ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}
+ENV PATH=/node_modules/.bin:${PATH}
+ENV VULKAN_SDK=/usr/local/VulkanSDK/1.0.65.0/x86_64
+ENV PATH=${PATH}:${VULKAN_SDK}/bin
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${VULKAN_SDK}/lib
+ENV VK_LAYER_PATH=${VULKAN_SDK}/etc/explicit_layer.d
diff --git a/tests/ci_build/Dockerfile.i386 b/docker/Dockerfile.ci_i386
similarity index 78%
rename from tests/ci_build/Dockerfile.i386
rename to docker/Dockerfile.ci_i386
index e4577c37edd1..6a8394e85763 100644
--- a/tests/ci_build/Dockerfile.i386
+++ b/docker/Dockerfile.ci_i386
@@ -1,3 +1,5 @@
+# CI docker i386 env
+
FROM ioft/i386-ubuntu:16.04
RUN apt-get update --fix-missing
@@ -13,3 +15,7 @@ RUN bash /install/ubuntu_install_python.sh
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint
new file mode 100644
index 000000000000..132e8ebb7df9
--- /dev/null
+++ b/docker/Dockerfile.ci_lint
@@ -0,0 +1,9 @@
+# For lint test
+# CI docker lint env
+FROM ubuntu:16.04
+
+RUN apt-get update && apt-get install -y sudo wget
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+RUN apt-get install -y doxygen graphviz
+RUN pip3 install cpplint pylint mypy
diff --git a/docker/Dockerfile.demo_cpu b/docker/Dockerfile.demo_cpu
new file mode 100644
index 000000000000..0778b0a28784
--- /dev/null
+++ b/docker/Dockerfile.demo_cpu
@@ -0,0 +1,31 @@
+# Minimum docker image for demo purposes
+# prebuilt-image: tvmai/demo-cpu
+FROM ubuntu:16.04
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+# Python: basic dependencies
+RUN apt-get update && apt-get install -y python3-dev python3-pip
+RUN pip3 install numpy nose-timer cython decorator scipy
+
+# LLVM
+RUN echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main \
+ >> /etc/apt/sources.list.d/llvm.list && \
+ wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && \
+ apt-get update && apt-get install -y --force-yes llvm-6.0
+
+# Jupyter notebook.
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+# Deep learning frameworks
+RUN pip3 install mxnet tensorflow keras
+
+# Build TVM
+COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh
+RUN bash /install/install_tvm_cpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
diff --git a/docker/Dockerfile.demo_gpu b/docker/Dockerfile.demo_gpu
new file mode 100644
index 000000000000..6f249986e22c
--- /dev/null
+++ b/docker/Dockerfile.demo_gpu
@@ -0,0 +1,34 @@
+# Minimum docker image for demo purposes
+# prebuilt-image: tvmai/demo-gpu
+FROM nvidia/cuda:8.0-cudnn7-devel
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+# Python: basic dependencies
+RUN apt-get update && apt-get install -y python3-dev python3-pip
+RUN pip3 install numpy nose-timer cython decorator scipy
+
+# LLVM
+RUN echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main \
+ >> /etc/apt/sources.list.d/llvm.list && \
+ wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && \
+ apt-get update && apt-get install -y --force-yes llvm-6.0
+
+# Jupyter notebook.
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+# Deep learning frameworks
+RUN pip3 install mxnet tensorflow keras
+
+# Build TVM
+COPY install/install_tvm_gpu.sh /install/install_tvm_gpu.sh
+RUN bash /install/install_tvm_gpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
+ENV PATH=/usr/local/nvidia/bin:${PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000000..e9b8b503062f
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,87 @@
+# TVM Docker
+
+This directory contains TVM's docker infrastructure.
+We use docker to provide build environments for CI and images for demos.
+We need [docker](https://docs.docker.com/engine/installation/) and
+[nvidia-docker](https://github.com/NVIDIA/nvidia-docker/) for GPU images.
+
+## Start Docker Bash Session
+
+You can use the following helper script to start an
+interactive bash session with a given image_name.
+
+```bash
+/path/to/tvm/docker/bash.sh image_name
+```
+
+The script does the following things:
+- Mount current directory to /workspace and set it as home
+- Switch user to be the same user that calls the bash.sh
+- Use the host-side network
+
+The helper bash script can be useful for building demo sessions.
+
+## Prebuilt Docker Images
+
+We provide several pre-built images for doing quick exploration with TVM installed.
+For example, you can run the following command to get the ```tvmai/demo-cpu``` image.
+
+```bash
+/path/to/tvm/docker/bash.sh tvmai/demo-cpu
+```
+
+Then inside the docker container, you can type the following command to start the jupyter notebook
+```bash
+jupyter notebook
+```
+
+Check out https://hub.docker.com/r/tvmai/ to get the full list of available prebuilt images.
+
+
+## Use Local Build Script
+
+We also provide scripts to build docker images locally.
+We use [`build.sh`](./build.sh) to build the images and run commands in them.
+To build and run docker images, we can run the following command
+at the root of the project.
+
+```bash
+./docker/build.sh image_name [command]
+```
+
+Here image_name corresponds to the docker image defined in
+```Dockerfile.image_name```.
+
+You can also start an interactive session by typing
+
+```bash
+./docker/build.sh image_name -it bash
+```
+
+The build command will map the tvm root to /workspace/ inside the container,
+with the same user as the user invoking the docker command.
+Here are some common examples of performing CI tasks.
+
+- lint the python code
+
+ ```bash
+ ./docker/build.sh ci_lint make pylint
+ ```
+
+- build the code with CUDA support
+
+ ```bash
+ ./docker/build.sh ci_gpu make -j$(nproc)
+ ```
+
+- run the python unittests
+
+ ```bash
+ ./docker/build.sh ci_gpu tests/scripts/task_python_unittest.sh
+ ```
+
+- build the documentation. The results will be available at `docs/_build/html`
+
+ ```bash
+ ./docker/build.sh ci_gpu make -C docs html
+ ```
diff --git a/docker/bash.sh b/docker/bash.sh
new file mode 100755
index 000000000000..ba935d7ed089
--- /dev/null
+++ b/docker/bash.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+#
+# Start a bash, mount /workspace to be current directory.
+#
+# Usage: docker/bash.sh <CONTAINER_NAME>
+# Starts an interactive session
+#
+# Usage2: docker/bash.sh <CONTAINER_NAME> [COMMAND]
+# Execute command in the docker image, non-interactive
+#
+if [ "$#" -lt 1 ]; then
+ echo "Usage: docker/bash.sh [COMMAND]"
+ exit -1
+fi
+
+DOCKER_IMAGE_NAME=("$1")
+
+if [ "$#" -eq 1 ]; then
+ COMMAND="bash"
+ CI_DOCKER_EXTRA_PARAMS=("-it" "--net=host")
+else
+ shift 1
+ COMMAND=("$@")
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+WORKSPACE="$(pwd)"
+
+# Use nvidia-docker if the container is GPU.
+if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* ]]; then
+ DOCKER_BINARY="nvidia-docker"
+else
+ DOCKER_BINARY="docker"
+fi
+
+# Print arguments.
+echo "WORKSPACE: ${WORKSPACE}"
+echo "DOCKER CONTAINER NAME: ${DOCKER_IMAGE_NAME}"
+echo ""
+
+echo "Running '${COMMAND[@]}' inside ${DOCKER_IMAGE_NAME}..."
+
+# By default we cleanup - remove the container once it finish running (--rm)
+# and share the PID namespace (--pid=host) so the process inside does not have
+# pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it).
+echo ${DOCKER_BINARY}
+${DOCKER_BINARY} run --rm --pid=host\
+ -v ${WORKSPACE}:/workspace \
+ -v ${SCRIPT_DIR}:/docker \
+ -w /workspace \
+ -e "CI_BUILD_HOME=/workspace" \
+ -e "CI_BUILD_USER=$(id -u -n)" \
+ -e "CI_BUILD_UID=$(id -u)" \
+ -e "CI_BUILD_GROUP=$(id -g -n)" \
+ -e "CI_BUILD_GID=$(id -g)" \
+ ${CI_DOCKER_EXTRA_PARAMS[@]} \
+ ${DOCKER_IMAGE_NAME}\
+ bash /docker/with_the_same_user \
+ ${COMMAND[@]}
diff --git a/tests/ci_build/ci_build.sh b/docker/build.sh
similarity index 93%
rename from tests/ci_build/ci_build.sh
rename to docker/build.sh
index 86c138aaf3a5..1d476e52e642 100755
--- a/tests/ci_build/ci_build.sh
+++ b/docker/build.sh
@@ -2,7 +2,7 @@
#
# Execute command within a docker container
#
-# Usage: ci_build.sh <CONTAINER_TYPE> [--dockerfile <DOCKERFILE_PATH>] [-it] <COMMAND>
+# Usage: build.sh <CONTAINER_TYPE> [--dockerfile <DOCKERFILE_PATH>] [-it] <COMMAND>
#
#
# CONTAINER_TYPE: Type of the docker container used to run the build: e.g.,
@@ -37,6 +37,11 @@ if [[ "$1" == "-it" ]]; then
shift 1
fi
+if [[ "$1" == "--net=host" ]]; then
+ CI_DOCKER_EXTRA_PARAMS+=('--net=host')
+ shift 1
+fi
+
if [[ ! -f "${DOCKERFILE_PATH}" ]]; then
echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\""
exit 1
@@ -71,8 +76,8 @@ function upsearch () {
# Set up WORKSPACE and BUILD_TAG. Jenkins will set them for you or we pick
# reasonable defaults if you run it outside of Jenkins.
-WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../../}"
-BUILD_TAG="${BUILD_TAG:-tvm-ci}"
+WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../}"
+BUILD_TAG="${BUILD_TAG:-tvm}"
# Determine the docker image name
DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}"
@@ -122,5 +127,5 @@ ${DOCKER_BINARY} run --rm --pid=host \
-e "CI_BUILD_GID=$(id -g)" \
${CI_DOCKER_EXTRA_PARAMS[@]} \
${DOCKER_IMG_NAME} \
- bash tests/ci_build/with_the_same_user \
+ bash docker/with_the_same_user \
${COMMAND[@]}
diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh
new file mode 100644
index 000000000000..51593e66506e
--- /dev/null
+++ b/docker/install/install_tvm_cpu.sh
@@ -0,0 +1,12 @@
+cd /usr
+git clone https://github.com/dmlc/tvm --recursive
+cd /usr/tvm
+echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
+echo set\(USE_BLAS openblas\) >> config.cmake
+mkdir -p build
+cd build
+cmake ..
+make -j10
diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh
new file mode 100644
index 000000000000..8a1324646fd5
--- /dev/null
+++ b/docker/install/install_tvm_gpu.sh
@@ -0,0 +1,14 @@
+cd /usr
+git clone https://github.com/dmlc/tvm --recursive
+cd /usr/tvm
+echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake
+echo set\(USE_CUDA ON\) >> config.cmake
+echo set\(USE_CUDNN ON\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
+echo set\(USE_BLAS openblas\) >> config.cmake
+mkdir -p build
+cd build
+cmake ..
+make -j10
diff --git a/tests/ci_build/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh
similarity index 76%
rename from tests/ci_build/install/ubuntu_install_core.sh
rename to docker/install/ubuntu_install_core.sh
index 9823ae0788ac..efc69c946b97 100644
--- a/tests/ci_build/install/ubuntu_install_core.sh
+++ b/docker/install/ubuntu_install_core.sh
@@ -1,5 +1,5 @@
# install libraries for building c++ core on ubuntu
-apt-get install -y --no-install-recommends --force-yes \
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
git make libgtest-dev cmake wget unzip libtinfo-dev libz-dev\
libcurl4-openssl-dev libopenblas-dev g++ sudo
diff --git a/docker/install/ubuntu_install_coreml.sh b/docker/install/ubuntu_install_coreml.sh
new file mode 100644
index 000000000000..4b0fd126c61d
--- /dev/null
+++ b/docker/install/ubuntu_install_coreml.sh
@@ -0,0 +1 @@
+pip3 install coremltools
diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh
new file mode 100644
index 000000000000..f5e0c2791d80
--- /dev/null
+++ b/docker/install/ubuntu_install_darknet.sh
@@ -0,0 +1,4 @@
+# install the necessary dependencies: cffi, opencv
+wget 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so
+pip2 install opencv-python cffi
+pip3 install opencv-python cffi
diff --git a/tests/ci_build/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_emscripten.sh
rename to docker/install/ubuntu_install_emscripten.sh
diff --git a/tests/ci_build/install/ubuntu_install_iverilog.sh b/docker/install/ubuntu_install_iverilog.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_iverilog.sh
rename to docker/install/ubuntu_install_iverilog.sh
diff --git a/tests/ci_build/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_java.sh
rename to docker/install/ubuntu_install_java.sh
diff --git a/docker/install/ubuntu_install_keras.sh b/docker/install/ubuntu_install_keras.sh
new file mode 100644
index 000000000000..33bc38c80972
--- /dev/null
+++ b/docker/install/ubuntu_install_keras.sh
@@ -0,0 +1,2 @@
+pip2 install keras tensorflow h5py
+pip3 install keras tensorflow h5py
diff --git a/tests/ci_build/install/ubuntu_install_llvm.sh b/docker/install/ubuntu_install_llvm.sh
similarity index 76%
rename from tests/ci_build/install/ubuntu_install_llvm.sh
rename to docker/install/ubuntu_install_llvm.sh
index e5b28b911f61..16d0fe150b7e 100644
--- a/tests/ci_build/install/ubuntu_install_llvm.sh
+++ b/docker/install/ubuntu_install_llvm.sh
@@ -8,10 +8,15 @@ echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
>> /etc/apt/sources.list.d/llvm.list
+echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main\
+ >> /etc/apt/sources.list.d/llvm.list
+echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main\
+ >> /etc/apt/sources.list.d/llvm.list
+
echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\
>> /etc/apt/sources.list.d/llvm.list
echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\
>> /etc/apt/sources.list.d/llvm.list
wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
-apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0
+apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0
diff --git a/docker/install/ubuntu_install_mxnet.sh b/docker/install/ubuntu_install_mxnet.sh
new file mode 100644
index 000000000000..0e7e9e3939a8
--- /dev/null
+++ b/docker/install/ubuntu_install_mxnet.sh
@@ -0,0 +1 @@
+pip3 install mxnet
diff --git a/tests/ci_build/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_nodejs.sh
rename to docker/install/ubuntu_install_nodejs.sh
diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh
new file mode 100644
index 000000000000..517ea77ab81e
--- /dev/null
+++ b/docker/install/ubuntu_install_onnx.sh
@@ -0,0 +1,8 @@
+# pin to a minimum version for now; quote the spec so the shell does not
+# treat >= as a redirection
+pip2 install 'onnx>=1.1.0'
+pip3 install 'onnx>=1.1.0'
+
+pip2 install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp27-cp27mu-manylinux1_x86_64.whl
+pip2 install torchvision
+pip3 install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp35-cp35m-manylinux1_x86_64.whl
+pip3 install torchvision
diff --git a/tests/ci_build/install/ubuntu_install_opencl.sh b/docker/install/ubuntu_install_opencl.sh
similarity index 68%
rename from tests/ci_build/install/ubuntu_install_opencl.sh
rename to docker/install/ubuntu_install_opencl.sh
index 636236539a98..ca4d1d04fd5c 100644
--- a/tests/ci_build/install/ubuntu_install_opencl.sh
+++ b/docker/install/ubuntu_install_opencl.sh
@@ -1,8 +1,8 @@
# Install OpenCL runtime in nvidia docker.
-apt-get install -y --no-install-recommends --force-yes \
- ocl-icd-libopencl1 \
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
+ ocl-icd-opencl-dev \
clinfo && \
- rm -rf /var/lib/apt/lists/*
+ rm -rf /var/lib/apt/lists/*
mkdir -p /etc/OpenCL/vendors && \
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
diff --git a/docker/install/ubuntu_install_opengl.sh b/docker/install/ubuntu_install_opengl.sh
new file mode 100644
index 000000000000..f8be6e351581
--- /dev/null
+++ b/docker/install/ubuntu_install_opengl.sh
@@ -0,0 +1,4 @@
+apt-get update --fix-missing
+
+apt-get install -y --no-install-recommends --force-yes \
+ libgl1-mesa-dev libglfw3-dev
\ No newline at end of file
diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
new file mode 100644
index 000000000000..a34019e1003e
--- /dev/null
+++ b/docker/install/ubuntu_install_python.sh
@@ -0,0 +1,12 @@
+# install python and pip, don't modify this, modify install_python_package.sh
+apt-get update && apt-get install -y python-dev
+
+# python 3.6
+apt-get update && yes | apt-get install software-properties-common
+add-apt-repository ppa:jonathonf/python-3.6 &&\
+ apt-get update && apt-get install -y python-pip python-dev python3.6 python3.6-dev
+
+rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3
+
+# Install pip
+cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py
diff --git a/tests/ci_build/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
similarity index 81%
rename from tests/ci_build/install/ubuntu_install_python_package.sh
rename to docker/install/ubuntu_install_python_package.sh
index fbed2e1904cd..3e5c88674079 100644
--- a/tests/ci_build/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -1,3 +1,3 @@
# install libraries for python package on ubuntu
pip2 install nose pylint numpy nose-timer cython decorator scipy tornado
-pip3 install nose pylint numpy nose-timer cython decorator scipy tornado
+pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset
diff --git a/docker/install/ubuntu_install_redis.sh b/docker/install/ubuntu_install_redis.sh
new file mode 100644
index 000000000000..dfc9a3c381b6
--- /dev/null
+++ b/docker/install/ubuntu_install_redis.sh
@@ -0,0 +1,3 @@
+apt-get update && apt-get install -y redis-server
+pip2 install xgboost psutil
+pip3 install xgboost psutil
diff --git a/tests/ci_build/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh
similarity index 100%
rename from tests/ci_build/install/ubuntu_install_rocm.sh
rename to docker/install/ubuntu_install_rocm.sh
diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh
new file mode 100644
index 000000000000..ba04c2e25e6f
--- /dev/null
+++ b/docker/install/ubuntu_install_sphinx.sh
@@ -0,0 +1 @@
+pip3 install sphinx sphinx-gallery sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image 'commonmark>=0.7.3' 'docutils>=0.11'
diff --git a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh
new file mode 100644
index 000000000000..a4155da49651
--- /dev/null
+++ b/docker/install/ubuntu_install_vulkan.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+wget https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run
+
+bash vulkansdk-linux-x86_64-1.0.65.0.run
+mv VulkanSDK /usr/local/VulkanSDK
+cd /usr/local/VulkanSDK/1.0.65.0
+./build_tools.sh
+./build_samples.sh
diff --git a/tests/ci_build/with_the_same_user b/docker/with_the_same_user
similarity index 91%
rename from tests/ci_build/with_the_same_user
rename to docker/with_the_same_user
index 1e6ab883694b..470d64384de6 100644
--- a/tests/ci_build/with_the_same_user
+++ b/docker/with_the_same_user
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
# This script is a wrapper creating the same user inside container as the one
-# running the ci_build.sh outside the container. It also set the home directory
+# running the docker/build.sh outside the container. It also sets the home directory
# for the user inside container to match the same absolute path as the workspace
# outside of container. Do not run this manually. It does not make sense. It is
# intended to be called by ci_build.sh only.
@@ -30,5 +30,6 @@ HOME=${CI_BUILD_HOME}\
sudo -u "#${CI_BUILD_UID}" --preserve-env\
PATH=${PATH}\
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\
+ PYTHONPATH=${PYTHONPATH}\
HOME=${CI_BUILD_HOME}\
${COMMAND[@]}
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 87e7da7043e5..7bb47ccab4c5 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -753,7 +753,7 @@ WARN_LOGFILE =
# spaces.
# Note: If this tag is empty the current directory is searched.
-INPUT = include/tvm topi/include/topi
+INPUT = include/tvm topi/include/topi nnvm/include/nnvm vta/include/vta
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1934,7 +1934,7 @@ ENABLE_PREPROCESSING = YES
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-MACRO_EXPANSION = NO
+MACRO_EXPANSION = YES
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
# the macro expansion is limited to the macros specified with the PREDEFINED and
diff --git a/docs/Makefile b/docs/Makefile
index 1e45fb5e3787..d7a12839ba3d 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -3,15 +3,10 @@
# You can set these variables from the command line.
SPHINXOPTS =
-SPHINXBUILD = sphinx-build
+SPHINXBUILD = python3 -m sphinx
PAPER =
BUILDDIR = _build
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
@@ -51,6 +46,8 @@ help:
clean:
rm -rf $(BUILDDIR)/*
rm -rf gen_modules
+ rm -rf tutorials
+ rm -rf vta/tutorials
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
diff --git a/docs/README.txt b/docs/README.txt
index b8780dd9fc87..fffdaa233ef8 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -1,6 +1,28 @@
-The documentation of tvm is generated with recommonmark and sphinx.
+TVM Documentation
+=================
+This folder contains the source of TVM's documents.
-- A hosted version of doc is at http://docs.tvmlang.org
-- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
+- A hosted version of doc is at http://docs.tvm.ai
+- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark Pillow
- Build tvm first in the root folder.
- To build locally, you need to enable USE_CUDA, USE_OPENCL, LLVM_CONFIG in config.mk and then type "make html" in this folder.
+
+Only Execute Specified Tutorials
+--------------------------------
+The document build process will execute all the tutorials in the sphinx gallery.
+This will cause failures in some cases when certain machines do not have the
+necessary environment. You can set ```TVM_TUTORIAL_EXEC_PATTERN``` to only execute
+the paths that match the regular expression pattern.
+
+For example, to only build tutorials under /vta/tutorials, run
+
+```bash
+TVM_TUTORIAL_EXEC_PATTERN=/vta/tutorials make html
+```
+
+To only build one specific file, do
+
+```bash
+# The backslash \ is used to escape . in the regular expression
+TVM_TUTORIAL_EXEC_PATTERN=file_name\.py make html
+```
diff --git a/docs/_static/css/tvm_theme.css b/docs/_static/css/tvm_theme.css
index 5e0838abf6cb..274589887b3f 100644
--- a/docs/_static/css/tvm_theme.css
+++ b/docs/_static/css/tvm_theme.css
@@ -9,3 +9,13 @@
nav .hidden-section {
display: inherit;
}
+
+.wy-side-nav-search {
+ background-color: #fff;
+ color: #333;
+}
+
+.version{
+ color: #404040 !important;
+}
+
diff --git a/docs/_static/img/README b/docs/_static/img/README
new file mode 100644
index 000000000000..414328cc729d
--- /dev/null
+++ b/docs/_static/img/README
@@ -0,0 +1,2 @@
+The logo file in this repo is an exception due to the needs of sphinx.
+By default we avoid putting large binary blobs into this repo.
\ No newline at end of file
diff --git a/docs/_static/img/tvm-logo-small.png b/docs/_static/img/tvm-logo-small.png
new file mode 100644
index 000000000000..c3519fece55b
Binary files /dev/null and b/docs/_static/img/tvm-logo-small.png differ
diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst
new file mode 100644
index 000000000000..0a2ae40f24a9
--- /dev/null
+++ b/docs/api/python/autotvm.rst
@@ -0,0 +1,73 @@
+tvm.autotvm
+-----------
+.. automodule:: tvm.autotvm
+
+tvm.autotvm.measure
+~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.measure.measure
+
+.. autoclass:: tvm.autotvm.measure.MeasureInput
+ :members:
+
+.. autoclass:: tvm.autotvm.measure.MeasureResult
+ :members:
+
+.. autofunction:: tvm.autotvm.measure.measure_option
+
+.. autofunction:: tvm.autotvm.measure.create_measure_batch
+
+
+tvm.autotvm.tuner
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.tuner
+ :members:
+
+.. autoclass:: tvm.autotvm.tuner.Tuner
+ :members:
+
+.. autoclass:: tvm.autotvm.tuner.RandomTuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.GridSearchTuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.GATuner
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.autotvm.tuner.XGBTuner
+ :members:
+ :inherited-members:
+
+.. automodule:: tvm.autotvm.tuner.callback
+ :members:
+
+.. automodule:: tvm.autotvm.tuner.graph_tuning
+ :members:
+
+tvm.autotvm.task
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.task
+ :members:
+
+.. automodule:: tvm.autotvm.task.task
+ :members:
+
+.. automodule:: tvm.autotvm.task.space
+ :members:
+
+.. automodule:: tvm.autotvm.task.dispatcher
+ :members:
+
+.. automodule:: tvm.autotvm.task.topi_integration
+ :members:
+
+.. automodule:: tvm.autotvm.task.nnvm_integration
+ :members:
+
+tvm.autotvm.record
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.autotvm.record
+ :members:
diff --git a/docs/api/python/bridge.rst b/docs/api/python/bridge.rst
new file mode 100644
index 000000000000..afc7dc298652
--- /dev/null
+++ b/docs/api/python/bridge.rst
@@ -0,0 +1,7 @@
+Framework Bridge APIs
+---------------------
+
+tvm.contrib.mxnet
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.mxnet
+ :members:
diff --git a/docs/api/python/contrib.rst b/docs/api/python/contrib.rst
index ed04230deb8d..a58a3aa4fbef 100644
--- a/docs/api/python/contrib.rst
+++ b/docs/api/python/contrib.rst
@@ -1,38 +1,103 @@
-Contrib APIs
-------------
+Additional Contrib APIs
+-----------------------
.. automodule:: tvm.contrib
-tvm.contrib.nvcc
-~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.nvcc
+tvm.contrib.cblas
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.cblas
+ :members:
+
+
+tvm.contrib.clang
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.clang
:members:
+
tvm.contrib.cc
~~~~~~~~~~~~~~
.. automodule:: tvm.contrib.cc
:members:
-tvm.contrib.xcode
-~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.xcode
+
+tvm.contrib.cublas
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.cublas
+ :members:
+
+
+tvm.contrib.emscripten
+~~~~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.emscripten
:members:
-tvm.contrib.rpc
+tvm.contrib.miopen
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.miopen
+ :members:
+
+tvm.contrib.ndk
~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.rpc
+.. automodule:: tvm.contrib.ndk
+ :members:
+
+
+tvm.contrib.nnpack
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.nnpack
+ :members:
+
+
+tvm.contrib.nvcc
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.nvcc
+ :members:
+
+
+tvm.contrib.pickle_memoize
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.pickle_memoize
+ :members:
+
+
+tvm.contrib.random
+~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.random
:members:
-tvm.contrib.graph_runtime
-~~~~~~~~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.graph_runtime
+
+tvm.contrib.rocblas
+~~~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.rocblas
:members:
+
+tvm.contrib.rocm
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.rocm
+ :members:
+
+
+tvm.contrib.spirv
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.spirv
+ :members:
+
+
+tvm.contrib.tar
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.tar
+ :members:
+
+
tvm.contrib.util
~~~~~~~~~~~~~~~~
.. automodule:: tvm.contrib.util
:members:
-tvm.contrib.cblas
+
+
+tvm.contrib.xcode
~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.cblas
+.. automodule:: tvm.contrib.xcode
:members:
diff --git a/docs/api/python/graph_runtime.rst b/docs/api/python/graph_runtime.rst
new file mode 100644
index 000000000000..89a223323e14
--- /dev/null
+++ b/docs/api/python/graph_runtime.rst
@@ -0,0 +1,4 @@
+tvm.contrib.graph_runtime
+-------------------------
+.. automodule:: tvm.contrib.graph_runtime
+ :members:
diff --git a/docs/api/python/hybrid.rst b/docs/api/python/hybrid.rst
new file mode 100644
index 000000000000..ac4111cfe768
--- /dev/null
+++ b/docs/api/python/hybrid.rst
@@ -0,0 +1,11 @@
+tvm.hybrid
+----------
+.. automodule:: tvm.hybrid
+
+.. autosummary::
+
+ tvm.hybrid.parse
+ tvm.hybrid.script
+
+.. autofunction:: tvm.hybrid.parse
+.. autofunction:: tvm.hybrid.script
diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst
index ee830e031462..59bd1795b7ec 100644
--- a/docs/api/python/index.rst
+++ b/docs/api/python/index.rst
@@ -14,6 +14,13 @@ Python API
ndarray
container
function
+ autotvm
+ graph_runtime
+ rpc
+ bridge
contrib
dev
topi
+ vta/index
+ nnvm/index
+ hybrid
diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst
index 71ecaaa2ce8d..3942c57f1a04 100644
--- a/docs/api/python/intrin.rst
+++ b/docs/api/python/intrin.rst
@@ -10,7 +10,11 @@ tvm.intrin
tvm.register_intrin_rule
tvm.exp
tvm.log
-
+ tvm.floor
+ tvm.ceil
+ tvm.trunc
+ tvm.round
+ tvm.abs
.. autofunction:: tvm.call_packed
.. autofunction:: tvm.call_pure_intrin
@@ -18,3 +22,8 @@ tvm.intrin
.. autofunction:: tvm.register_intrin_rule
.. autofunction:: tvm.exp
.. autofunction:: tvm.log
+.. autofunction:: tvm.floor
+.. autofunction:: tvm.ceil
+.. autofunction:: tvm.trunc
+.. autofunction:: tvm.round
+.. autofunction:: tvm.abs
diff --git a/docs/api/python/ndarray.rst b/docs/api/python/ndarray.rst
index a06117e05543..2c8f0c292a43 100644
--- a/docs/api/python/ndarray.rst
+++ b/docs/api/python/ndarray.rst
@@ -14,5 +14,6 @@ tvm.ndarray
.. autofunction:: tvm.opencl
.. autofunction:: tvm.metal
.. autofunction:: tvm.ndarray.array
+.. autofunction:: tvm.ndarray.empty
.. autofunction:: tvm.register_extension
diff --git a/docs/api/python/nnvm/compiler.rst b/docs/api/python/nnvm/compiler.rst
new file mode 100644
index 000000000000..4b995b28cd9e
--- /dev/null
+++ b/docs/api/python/nnvm/compiler.rst
@@ -0,0 +1,23 @@
+nnvm.compiler
+-------------
+
+.. automodule:: nnvm.compiler
+
+.. autofunction:: nnvm.compiler.build
+
+.. autofunction:: nnvm.compiler.build_config
+
+.. autofunction:: nnvm.compiler.save_param_dict
+
+.. autofunction:: nnvm.compiler.load_param_dict
+
+.. autofunction:: nnvm.compiler.optimize
+
+.. automodule:: nnvm.compiler.graph_util
+ :members:
+
+.. automodule:: nnvm.compiler.graph_attr
+ :members:
+
+.. automodule:: nnvm.compiler.compile_engine
+ :members:
diff --git a/docs/api/python/nnvm/frontend.rst b/docs/api/python/nnvm/frontend.rst
new file mode 100644
index 000000000000..f872a6b878e2
--- /dev/null
+++ b/docs/api/python/nnvm/frontend.rst
@@ -0,0 +1,12 @@
+nnvm.frontend
+-------------
+
+.. automodule:: nnvm.frontend
+
+.. autofunction:: nnvm.frontend.from_mxnet
+
+.. autofunction:: nnvm.frontend.from_onnx
+
+.. autofunction:: nnvm.frontend.from_coreml
+
+.. autofunction:: nnvm.frontend.from_keras
diff --git a/docs/api/python/nnvm/graph.rst b/docs/api/python/nnvm/graph.rst
new file mode 100644
index 000000000000..5b36ab5194fd
--- /dev/null
+++ b/docs/api/python/nnvm/graph.rst
@@ -0,0 +1,8 @@
+nnvm.graph
+----------
+.. automodule:: nnvm.graph
+
+.. autofunction:: nnvm.graph.create
+
+.. autoclass:: nnvm.graph.Graph
+ :members:
diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst
new file mode 100644
index 000000000000..c0e5912c76be
--- /dev/null
+++ b/docs/api/python/nnvm/index.rst
@@ -0,0 +1,13 @@
+NNVM API
+========
+
+This document contains the python API to the NNVM compiler toolchain.
+
+.. toctree::
+ :maxdepth: 2
+
+ compiler
+ frontend
+ symbol
+ graph
+ top
diff --git a/docs/api/python/nnvm/symbol.rst b/docs/api/python/nnvm/symbol.rst
new file mode 100644
index 000000000000..c341d2ef71d7
--- /dev/null
+++ b/docs/api/python/nnvm/symbol.rst
@@ -0,0 +1,10 @@
+nnvm.symbol
+-----------
+.. automodule:: nnvm.symbol
+
+.. autoclass:: nnvm.symbol.Symbol
+ :members:
+
+.. autoclass:: nnvm.symbol.Variable
+
+.. autofunction:: nnvm.symbol.Group
diff --git a/docs/api/python/nnvm/top.rst b/docs/api/python/nnvm/top.rst
new file mode 100644
index 000000000000..fd28ff363f0d
--- /dev/null
+++ b/docs/api/python/nnvm/top.rst
@@ -0,0 +1,13 @@
+nnvm.top
+--------
+.. automodule:: nnvm.top
+
+.. autofunction:: register_compute
+
+.. autofunction:: register_schedule
+
+.. autofunction:: register_pattern
+
+
+.. autoclass:: nnvm.top.AttrDict
+ :members:
diff --git a/docs/api/python/rpc.rst b/docs/api/python/rpc.rst
new file mode 100644
index 000000000000..6c4ef59a493c
--- /dev/null
+++ b/docs/api/python/rpc.rst
@@ -0,0 +1,22 @@
+tvm.rpc
+-------
+.. automodule:: tvm.rpc
+
+.. autofunction:: tvm.rpc.connect
+.. autofunction:: tvm.rpc.connect_tracker
+
+.. autoclass:: tvm.rpc.TrackerSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.RPCSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.LocalSession
+ :members:
+ :inherited-members:
+
+.. autoclass:: tvm.rpc.Server
+ :members:
+ :inherited-members:
diff --git a/docs/api/python/target.rst b/docs/api/python/target.rst
index 0f824324d0c5..e5723349b5c0 100644
--- a/docs/api/python/target.rst
+++ b/docs/api/python/target.rst
@@ -1,13 +1,4 @@
tvm.target
----------
.. automodule:: tvm.target
-
-.. autofunction:: tvm.target.generic_func
-
-.. autoclass:: tvm.target.Target
:members:
-
-.. autofunction:: tvm.target.cuda
-.. autofunction:: tvm.target.rocm
-.. autofunction:: tvm.target.rasp
-.. autofunction:: tvm.target.create
diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 4f7d8cace31b..7f150ddbf7cd 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -1,24 +1,45 @@
-TVM Operator Inventory
-----------------------
+TOPI
+----
.. automodule:: topi
-Index
-~~~~~
-
-**List of operators**
+List of operators
+~~~~~~~~~~~~~~~~~
.. autosummary::
+ topi.identity
+ topi.negative
+ topi.floor
+ topi.ceil
+ topi.trunc
+ topi.round
+ topi.abs
topi.exp
topi.tanh
topi.log
topi.sqrt
topi.sigmoid
+ topi.clip
+ topi.cast
topi.transpose
+ topi.flip
+ topi.strided_slice
topi.expand_dims
+ topi.reshape
+ topi.squeeze
+ topi.concatenate
+ topi.split
+ topi.take
+ topi.full
+ topi.full_like
topi.nn.relu
topi.nn.leaky_relu
topi.nn.dilate
+ topi.nn.pool
+ topi.nn.global_pool
+ topi.nn.upsampling
+ topi.nn.softmax
+ topi.nn.log_softmax
topi.nn.conv2d_nchw
topi.nn.conv2d_hwcn
topi.nn.depthwise_conv2d_nchw
@@ -26,15 +47,28 @@ Index
topi.max
topi.sum
topi.min
+ topi.argmax
+ topi.argmin
topi.broadcast_to
- topi.broadcast_add
- topi.broadcast_sub
- topi.broadcast_mul
- topi.broadcast_div
-
+ topi.add
+ topi.subtract
+ topi.multiply
+ topi.divide
+ topi.mod
+ topi.maximum
+ topi.minimum
+ topi.power
+ topi.greater
+ topi.less
+ topi.equal
+ topi.not_equal
+ topi.greater_equal
+ topi.less_equal
+ topi.image.resize
-**List of schedules**
+List of schedules
+~~~~~~~~~~~~~~~~~
.. autosummary::
topi.generic.schedule_conv2d_nchw
@@ -45,33 +79,65 @@ Index
topi
~~~~
+.. autofunction:: topi.negative
+.. autofunction:: topi.identity
+.. autofunction:: topi.floor
+.. autofunction:: topi.ceil
+.. autofunction:: topi.trunc
+.. autofunction:: topi.round
+.. autofunction:: topi.abs
.. autofunction:: topi.exp
.. autofunction:: topi.tanh
.. autofunction:: topi.log
.. autofunction:: topi.sqrt
.. autofunction:: topi.sigmoid
+.. autofunction:: topi.clip
+.. autofunction:: topi.cast
.. autofunction:: topi.transpose
+.. autofunction:: topi.flip
+.. autofunction:: topi.strided_slice
.. autofunction:: topi.expand_dims
+.. autofunction:: topi.reshape
+.. autofunction:: topi.squeeze
+.. autofunction:: topi.concatenate
+.. autofunction:: topi.split
+.. autofunction:: topi.take
+.. autofunction:: topi.full
+.. autofunction:: topi.full_like
.. autofunction:: topi.max
.. autofunction:: topi.sum
.. autofunction:: topi.min
.. autofunction:: topi.broadcast_to
-.. autofunction:: topi.broadcast_add
-.. autofunction:: topi.broadcast_sub
-.. autofunction:: topi.broadcast_mul
-.. autofunction:: topi.broadcast_div
-
+.. autofunction:: topi.add
+.. autofunction:: topi.subtract
+.. autofunction:: topi.multiply
+.. autofunction:: topi.divide
+.. autofunction:: topi.mod
+.. autofunction:: topi.maximum
+.. autofunction:: topi.minimum
+.. autofunction:: topi.power
+.. autofunction:: topi.greater
+.. autofunction:: topi.less
topi.nn
~~~~~~~
.. autofunction:: topi.nn.relu
.. autofunction:: topi.nn.leaky_relu
.. autofunction:: topi.nn.dilate
+.. autofunction:: topi.nn.pool
+.. autofunction:: topi.nn.global_pool
+.. autofunction:: topi.nn.upsampling
+.. autofunction:: topi.nn.softmax
+.. autofunction:: topi.nn.log_softmax
.. autofunction:: topi.nn.conv2d_nchw
.. autofunction:: topi.nn.conv2d_hwcn
.. autofunction:: topi.nn.depthwise_conv2d_nchw
.. autofunction:: topi.nn.depthwise_conv2d_nhwc
+topi.image
+~~~~~~~~~~
+.. autofunction:: topi.image.resize
+
topi.generic
~~~~~~~~~~~~
diff --git a/docs/api/python/tvm.rst b/docs/api/python/tvm.rst
index 8700da38273b..6522df3ae9d3 100644
--- a/docs/api/python/tvm.rst
+++ b/docs/api/python/tvm.rst
@@ -15,6 +15,7 @@ The user facing API for computation declaration.
tvm.extern
tvm.decl_buffer
tvm.reduce_axis
+ tvm.select
tvm.thread_axis
tvm.comm_reducer
tvm.sum
@@ -33,6 +34,7 @@ The user facing API for computation declaration.
.. autofunction:: tvm.extern
.. autofunction:: tvm.decl_buffer
.. autofunction:: tvm.reduce_axis
+.. autofunction:: tvm.select
.. autofunction:: tvm.thread_axis
.. autofunction:: tvm.comm_reducer
.. autofunction:: tvm.sum
diff --git a/docs/api/python/vta/index.rst b/docs/api/python/vta/index.rst
new file mode 100644
index 000000000000..014b789e5aa0
--- /dev/null
+++ b/docs/api/python/vta/index.rst
@@ -0,0 +1,28 @@
+VTA API
+=======
+
+This document contains the python API to the VTA compiler toolchain.
+
+.. automodule:: vta
+
+Hardware Information
+--------------------
+
+.. autofunction:: vta.Environment
+.. autofunction:: vta.get_env
+
+RPC Utilities
+-------------
+
+.. autofunction:: vta.reconfig_runtime
+.. autofunction:: vta.program_fpga
+
+
+Compiler API
+------------
+We program VTA using TVM, so the compiler API in the vta package
+is only a thin wrapper that provides VTA specific extensions.
+
+.. autofunction:: vta.build_config
+.. autofunction:: vta.build
+.. autofunction:: vta.lower
diff --git a/docs/api_links.rst b/docs/api_links.rst
index 9a55af1728b9..909cfe367f29 100644
--- a/docs/api_links.rst
+++ b/docs/api_links.rst
@@ -1,5 +1,5 @@
-Links to API References
-=======================
+Links to C++ and JS API References
+==================================
This page contains links to API references that are built with different doc build systems.
diff --git a/docs/conf.py b/docs/conf.py
index 4a42fb0fedb0..989d26f87d3e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,6 +25,8 @@
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
sys.path.insert(0, os.path.join(curr_path, '../python/'))
sys.path.insert(0, os.path.join(curr_path, '../topi/python'))
+sys.path.insert(0, os.path.join(curr_path, '../nnvm/python'))
+sys.path.insert(0, os.path.join(curr_path, '../vta/python'))
# -- General configuration ------------------------------------------------
@@ -40,6 +42,7 @@
'.md': CommonMarkParser
}
os.environ['TVM_BUILD_DOC'] = '1'
+os.environ['NNVM_BUILD_DOC'] = '1'
# Version information.
import tvm
version = tvm.__version__
@@ -137,6 +140,14 @@
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
+html_theme_options = {
+ 'analytics_id': 'UA-75982049-2',
+ 'logo_only': True,
+}
+
+html_logo = "_static/img/tvm-logo-small.png"
+
+
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
@@ -174,12 +185,17 @@ def run_doxygen(folder):
from sphinx_gallery.sorting import ExplicitOrder
-examples_dirs = ['../tutorials/']
-gallery_dirs = ['tutorials']
+examples_dirs = ["../tutorials/", "../vta/tutorials/"]
+gallery_dirs = ["tutorials", "vta/tutorials"]
+
subsection_order = ExplicitOrder(
['../tutorials/language',
'../tutorials/optimize',
- '../tutorials/deployment'])
+ '../tutorials/autotvm',
+ '../tutorials/vta',
+ '../tutorials/topi',
+ '../tutorials/deployment',
+ '../tutorials/nnvm'])
def generate_doxygen_xml(app):
"""Run the doxygen make commands if we're on the ReadTheDocs server"""
@@ -207,7 +223,7 @@ def setup(app):
'examples_dirs': examples_dirs,
'gallery_dirs': gallery_dirs,
'subsection_order': subsection_order,
+ 'filename_pattern': os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"),
'find_mayavi_figures': False,
- 'filename_pattern': '.py',
'expected_failing_examples': []
}
diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst
new file mode 100644
index 000000000000..dc7d998ca37f
--- /dev/null
+++ b/docs/contribute/code_guide.rst
@@ -0,0 +1,39 @@
+.. _code_guide:
+
+Code Guide and Tips
+===================
+
+This is a document used to record tips in tvm codebase for reviewers and contributors.
+Most of them are lessons summarized during the contributing and review process.
+
+
+C++ Code Styles
+---------------
+- Use the Google C/C++ style.
+- The public facing functions are documented in doxygen format.
+- Favor concrete type declaration over ``auto`` as long as it is short.
+- Favor passing by const reference (e.g. ``const Expr&``) over passing by value,
+  except when the function consumes the value by copy constructor or move;
+  in such cases passing by value is better than passing by const reference.
+  See the sketch below.
+
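+A minimal sketch of the pass-by-value rule above (the class and its members
+are hypothetical):
+
+.. code:: c++
+
+   #include <string>
+   #include <utility>
+
+   class Node {
+    public:
+     // consumes the value: pass by value, then move it into place
+     void SetName(std::string name) { name_ = std::move(name); }
+     // only reads the value: pass by const reference
+     bool MatchesName(const std::string& other) const { return name_ == other; }
+
+    private:
+     std::string name_;
+   };
+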
+Python Code Styles
+------------------
+- The functions and classes are documented in `numpydoc <https://numpydoc.readthedocs.io/en/latest/>`_ format; see the sketch below.
+- Check your code style using ``make pylint``
+
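+A minimal numpydoc-style docstring sketch (the function itself is hypothetical):
+
+.. code:: python
+
+    def scale(data, factor):
+        """Scale the input tensor by a constant factor.
+
+        Parameters
+        ----------
+        data : tvm.Tensor
+            The input tensor.
+        factor : float
+            The scaling factor.
+
+        Returns
+        -------
+        output : tvm.Tensor
+            The scaled tensor.
+        """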
+
+Handle Integer Constant Expression
+----------------------------------
+We often need to handle constant integer expressions in tvm. Before we do so, the first question to ask is whether it is really necessary to get a constant integer. If a symbolic expression also works and lets the logic flow, we should use the symbolic expression as much as possible, so that the generated code works for shapes that are not known ahead of time.
+
+Note that in some cases we cannot know certain information, e.g. the sign of a symbolic variable. It is ok to make assumptions in such cases, while adding precise support when the variable is constant.
+
+If we do have to get constant integer expression, we should get the constant value using type ``int64_t`` instead of ``int``, to avoid potential integer overflow. We can always reconstruct an integer with the corresponding expression type via ``make_const``. The following code gives an example.
+
+.. code:: c++
+
+ Expr CalculateExpr(Expr value) {
+ int64_t int_value = GetConstInt(value);
+ int_value = CalculateExprInInt64(int_value);
+ return make_const(value.type(), int_value);
+ }
diff --git a/docs/contribute/code_review.rst b/docs/contribute/code_review.rst
new file mode 100644
index 000000000000..344296932703
--- /dev/null
+++ b/docs/contribute/code_review.rst
@@ -0,0 +1,63 @@
+Perform Code Reviews
+====================
+
+This is a general guideline for code reviewers. First of all, while it is great to add new features to a project, we must also be aware that each line of code we introduce also brings **technical debt** that we may have to eventually pay.
+
+Open source code is maintained by a community with diverse backgrounds, so it is even more important to bring clear, documented and maintainable code. Code reviews are a shepherding process to spot potential problems and improve the quality of the code. We should, however, not rely on the code review process to get the code into a ready state. Contributors are encouraged to polish the code to a ready state before requesting reviews. This is especially expected for code owner and committer candidates.
+
+Here are some checklists for code reviews; they are also a helpful reference for contributors.
+
+
+Hold the Highest Standard
+-------------------------
+The first rule for code reviewers is to always keep the highest standard, and not approve code just to "be friendly". Good, informative critiques help us learn from each other and prevent technical debt at an early stage.
+
+Ensure Test Coverage
+--------------------
+Each new feature change should introduce test cases; bug fixes should include regression tests that prevent the problem from happening again.
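+
+As an illustration, here is a minimal sketch of what such a test can look like,
+written against the python APIs used elsewhere in these docs; the computation
+and the tolerance are placeholders for whatever the patch actually touches:
+
+.. code:: python
+
+    import numpy as np
+    import tvm
+
+    def test_vector_add():
+        """Check the generated kernel against a numpy reference."""
+        n = 16
+        A = tvm.placeholder((n,), name="A")
+        B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B")
+        s = tvm.create_schedule(B.op)
+        f = tvm.build(s, [A, B], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=n).astype("float32"))
+        b = tvm.nd.array(np.zeros(n, dtype="float32"))
+        f(a, b)
+        np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)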
+
+Documentations are Mandatory
+----------------------------
+Documentation is a place we usually overlook; new functions or changes to a function should be directly reflected in the documentation. A new feature is meaningless without documentation to make it accessible. See more at :ref:`doc_guide`.
+
+Deliberate on User-facing API
+-----------------------------
+A good, minimal and stable API is critical to the project's life. A good API makes a huge difference. Always think very carefully about all the aspects, including naming, argument definitions and behavior. One good rule to check is to be consistent with the APIs of existing well-known packages if the features overlap. For example, tensor operation APIs should always be consistent with numpy.
+
+Minimum Dependency
+------------------
+Always be cautious in introducing dependencies. While it is important to reuse code and not reinvent the wheel, dependencies can increase the deployment burden on users. A good design principle is to depend on a part only when a user actually uses it.
+
+Ensure Readability
+------------------
+While it is hard to implement a new feature, it is even harder to make others understand and maintain the code you wrote. It is common for a PMC member or committer to not be able to understand certain contributions. In such cases, a reviewer should say "I don't understand" and ask the contributor to clarify. We highly encourage code comments which explain the code logic along with the code.
+
+Concise Implementation
+----------------------
+Some basic principles apply here: favor vectorized array code over loops, and check whether an existing API already solves the problem.
+
+Document Lessons in Code Reviews
+--------------------------------
+When you find there are some common lessons that can be summarized in the guideline,
+add it to the :ref:`code_guide`.
+It is always good to refer to the guideline document when requesting changes,
+so the lessons can be shared to all the community.
+
+Respect each other
+------------------
+Code reviewers and contributors are paying the most precious currency in the world -- time. We are volunteers in the community who spend this time to build good code, help each other, learn and have fun hacking.
+
+Learn from other Code Reviews
+-----------------------------
+There can be multiple reviewers reviewing the same changes, and in many cases the
+other reviewers may spot things you did not find. Try to learn from other code
+reviews and, when possible, document these lessons.
+
+Approve and Request Changes Explicitly
+--------------------------------------
+The contributor and code owner can request code reviews from multiple reviewers.
+Remember to approve changes once your comments have been addressed in a code review.
+To do so, click on the changes tab in the pull request, then select approve,
+or comment on the code and click request changes.
+The code owner can decide case by case whether the code can be merged when some of
+the reviewers did not respond in time (e.g. within a week) and the existing reviews
+are sufficient.
diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
new file mode 100644
index 000000000000..1023cf0ddccc
--- /dev/null
+++ b/docs/contribute/community.rst
@@ -0,0 +1,51 @@
+TVM Community Structure
+=======================
+
+TVM adopts the Apache style model and is governed by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. There are several roles in the community:
+
+- Project Management Committee (PMC): a small group of active committers that moderate the discussion and RFCs, and manage project releases.
+- Committer: an individual who has made substantial contributions to the project, is granted write access to the project, and oversees the general direction of the project.
+- Code Owner: an individual who is responsible for a specific area of the codebase.
+- Reviewer: an individual who is qualified to review a specific area of the codebase.
+- Contributor: anyone who contributes to the project.
+
+This document explains the responsibilities and criteria for each role.
+See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors and their roles.
+
+
+Project Management Committee
+----------------------------
+
+The PMC consists of a small group of active committers that moderate the discussion, provide mentorship to committers and code owners and manage the project release. PMC members need to actively manage the general project directions. Note that most major design choices and proposed changes should reach consensus among the committers.
+
+Committer
+---------
+
+Committers are individuals who are granted write access to the project. Committers oversee the general project direction and participate in the evaluation of RFCs involving major design changes. Here is a list of useful things to do to help become a committer.
+
+- Deep understanding of one or a few modules in the project.
+- Good understanding of the general project structure, demonstrated by discussions over RFCs, code reviews and proposals of new features
+- Active history of code reviews that demonstrate good technical ability
+- Contribution history of high-quality documentation and tutorials that promote the project
+- History of creating clean, maintainable code with good test cases.
+
+New committers are nominated by current committers from current code owners.
+
+Code Owner
+----------
+
+A code owner is an individual who is responsible for a specific area of the code-base. Code owners are responsible for the areas they are in charge of and oversee the code review process of the corresponding module. Changes to a specific area need to be approved by one of its owners in order to be merged. Once a pull request is approved by the designated code owner, the code can be directly merged into the repo. Code owners are essential for a high quality and healthy codebase.
+
+We welcome new code owners that help to keep good code quality, testing, and documentation in specific areas. Here is a list of useful traits that help the community to recognize potential code owners:
+
+- High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review
+- Good coverage of tests and documentation in the contributions
+- Informative code reviews that help other contributors adhere to a good standard, spot problems in contributions, etc.
+- Active participation in the discussion forum
+
+Reviewer
+--------
+
+A reviewer is an individual who has actively contributed to the project and is willing to participate in the code review of new contributions. We invite reviewers from active contributors. The reviewer invitation will be sent to the potential reviewer's email, so please log in to the discussion forum so that we know which email address to send the invitation to.
+We actively seek reviews from reviewers. High-quality code reviews prevent technical debt in the long term and are crucial to the success of the project.
+A pull request to the project has to be reviewed by a reviewer in order to be merged.
diff --git a/docs/contribute/document.rst b/docs/contribute/document.rst
new file mode 100644
index 000000000000..ab67fbec9384
--- /dev/null
+++ b/docs/contribute/document.rst
@@ -0,0 +1,88 @@
+.. _doc_guide:
+
+Write Document and Tutorials
+============================
+
+We use `Sphinx <http://sphinx-doc.org>`_ for the main documentation.
+Sphinx supports both reStructuredText and markdown.
+When possible, we encourage the use of reStructuredText as it has richer features.
+Note that python docstrings and tutorials allow you to embed reStructuredText syntax.
+
+
+Document Python
+---------------
+We use the `numpydoc <https://numpydoc.readthedocs.io/en/latest/>`_
+format to document functions and classes.
+The following snippet gives an example docstring.
+We always document all the public functions;
+when necessary, we provide a usage example of the features we support (as shown below).
+
+.. code:: python
+
+ def myfunction(arg1, arg2, arg3=3):
+ """Briefly describe my function.
+
+ Parameters
+ ----------
+ arg1 : Type1
+ Description of arg1
+
+ arg2 : Type2
+ Description of arg2
+
+ arg3 : Type3, optional
+ Description of arg3
+
+ Returns
+ -------
+ rv1 : RType1
+ Description of return type one
+
+ Examples
+ --------
+ .. code:: python
+
+ # Example usage of myfunction
+ x = myfunction(1, 2)
+ """
+ return rv1
+
+Be careful to leave blank lines between sections of your documents.
+In the above case, there has to be a blank line before `Parameters`, `Returns` and `Examples`
+in order for the doc to be built correctly. To add a new function to the doc,
+we need to add the `sphinx.autodoc <http://www.sphinx-doc.org/en/master/ext/autodoc.html>`_
+rules to `docs/api/python <https://github.com/dmlc/tvm/tree/master/docs/api/python>`_.
+You can refer to the existing files under this folder on how to add the functions.
+
+
+Document C++
+------------
+We use the doxygen format to document C++ functions.
+The following snippet shows an example of a C++ docstring.
+
+.. code:: c++
+
+ /*!
+ * \brief Description of my function
+ * \param arg1 Description of arg1
+ * \param arg2 Description of arg2
+ * \returns describe return value
+ */
+ int myfunction(int arg1, int arg2) {
+ // When necessary, also add comments to clarify the internal logic
+ }
+
+Besides documenting function usage, we also highly recommend contributors
+add comments about the code logic to improve readability.
+
+
+Write Tutorials
+---------------
+We use `sphinx-gallery <https://sphinx-gallery.github.io/>`_ to build python tutorials.
+You can find the source code under `tutorials <https://github.com/dmlc/tvm/tree/master/tutorials>`_; it is quite self explanatory.
+One thing worth noting is that the comment blocks are written in reStructuredText instead of markdown, so be aware of the syntax.
+
+The tutorial code will run on our build server to generate the document page.
+So we may have a restriction like not being able to access a remote Raspberry Pi.
+In such a case, add a flag variable to the tutorial (e.g. `use_rasp`) that lets users easily switch to the real device by changing one flag,
+and use the existing environment to demonstrate the usage, as sketched below.
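+
+A hedged sketch of this pattern (the host name and port below are placeholders,
+and `use_rasp` is just a conventional flag name):
+
+.. code:: python
+
+    import tvm
+    from tvm.contrib import rpc
+
+    # Set use_rasp to True to run on a real Raspberry Pi via RPC;
+    # the docs build server keeps it False and runs locally instead.
+    use_rasp = False
+
+    if use_rasp:
+        remote = rpc.connect("raspberrypi.local", 9090)  # placeholder host/port
+        ctx = remote.cpu(0)
+    else:
+        ctx = tvm.cpu(0)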
diff --git a/docs/contribute/git_howto.md b/docs/contribute/git_howto.md
new file mode 100644
index 000000000000..53ff89b127df
--- /dev/null
+++ b/docs/contribute/git_howto.md
@@ -0,0 +1,57 @@
+# Git Usage Tips
+
+Here are some tips for git workflow.
+
+## How to resolve conflict with master
+- First rebase to most recent master
+```bash
+# The first two steps can be skipped after you do it once.
+git remote add upstream [url to tvm repo]
+git fetch upstream
+git rebase upstream/master
+```
+- Git may show some conflicts it cannot merge, say ```conflicted.py```.
+ - Manually modify the file to resolve the conflict.
+ - After you have resolved the conflict, mark it as resolved by
+```bash
+git add conflicted.py
+```
+- Then you can continue rebase by
+```bash
+git rebase --continue
+```
+- Finally, push to your fork; you may need to force push here.
+```bash
+git push --force
+```
+
+## How to combine multiple commits into one
+Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones,
+to create a PR with a set of meaningful commits. You can do it by the following steps.
+- Before doing so, configure the default editor of git if you haven't done so before.
+```bash
+git config core.editor the-editor-you-like
+```
+- Assume we want to merge the last 3 commits, type the following commands
+```bash
+git rebase -i HEAD~3
+```
+- It will pop up a text editor. Set the first commit to ```pick```, and change the later ones to ```squash```.
+- After you save the file, another text editor will pop up to ask you to modify the combined commit message.
+- Push the changes to your fork; you need to force push.
+```bash
+git push --force
+```
+
+## Reset to the most recent master
+You can always use git reset to reset your branch to the most recent master.
+Note that all your ***local changes will get lost***.
+So only do this when you do not have local changes or when your pull request has just been merged.
+```bash
+git reset --hard [hash tag of master]
+git push --force
+```
+
+## What is the consequence of force push
+The previous two tips require force push because we altered the path of the commits.
+It is fine to force push to your own fork, as long as the commits changed are only yours.
diff --git a/docs/contribute/index.rst b/docs/contribute/index.rst
new file mode 100644
index 000000000000..ea358e287f60
--- /dev/null
+++ b/docs/contribute/index.rst
@@ -0,0 +1,31 @@
+Contribute to TVM
+=================
+
+TVM has been developed by community members.
+Everyone is welcome to contribute.
+We value all forms of contributions, including, but not limited to:
+
+- Code reviewing of the existing patches.
+- Documentation and usage examples
+- Community participation in forums and issues.
+- Code readability and developer guide
+
+ - We welcome contributions that add code comments
+ to improve readability
+ - We also welcome contributions to docs that explain the
+ design choices of the internals.
+
+- Test cases to make the codebase more robust
+- Tutorials, blog posts, talks that promote the project.
+
+Here are guidelines for contributing to various aspects of the project:
+
+.. toctree::
+ :maxdepth: 2
+
+ community
+ code_review
+ document
+ code_guide
+ pull_request
+ git_howto
diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
new file mode 100644
index 000000000000..80a0448c08dd
--- /dev/null
+++ b/docs/contribute/pull_request.rst
@@ -0,0 +1,26 @@
+Submit a Pull Request
+=====================
+
+This is a quick guide to submitting a pull request; please also refer to the detailed guidelines.
+
+- Before submitting, please rebase your code on the most recent version of master; you can do it by
+
+ .. code:: bash
+
+ git remote add upstream [url to tvm repo]
+ git fetch upstream
+ git rebase upstream/master
+
+- Make sure the code style check passes by typing ``make lint``, and that all the existing test cases pass.
+- Add test cases to cover the new features or bug fixes the patch introduces.
+- Document the code you wrote, see more at :ref:`doc_guide`
+- Send the pull request, and fix the problems reported by automatic checks.
+  Request code reviews from other contributors and improve your patch according to the feedback.
+
+ - To get your code reviewed quickly, we encourage you to help review others' code so they can do the favor in return.
+ - Code review is a shepherding process that helps to improve contributor's code quality.
+ We should treat it proactively, to improve the code as much as possible before the review.
+ We highly value patches that can get in without extensive reviews.
+ - The detailed code review guidelines summarize useful lessons from past reviews.
+
+- The patch can be merged after the reviewers approve the pull request.
diff --git a/docs/deploy/android.md b/docs/deploy/android.md
new file mode 100644
index 000000000000..ca431693c63a
--- /dev/null
+++ b/docs/deploy/android.md
@@ -0,0 +1,25 @@
+# Deploy to Android
+
+
+## Build model for Android Target
+
+NNVM compilation of a model for the Android target can follow the same approach as android_rpc.
+
+A reference example can be found at [chainer-nnvm-example](https://github.com/tkat0/chainer-nnvm-example).
+
+The above example will directly run the compiled model on the RPC target. The modification below at [run_mobile.py](https://github.com/tkat0/chainer-nnvm-example/blob/5b97fd4d41aa4dde4b0aceb0be311054fb5de451/run_mobile.py#L64) will save the compilation output which is required on the Android target.
+
+```python
+lib.export_library("deploy_lib.so", ndk.create_shared)
+with open("deploy_graph.json", "w") as fo:
+ fo.write(graph.json())
+with open("deploy_param.params", "wb") as fo:
+ fo.write(nnvm.compiler.save_param_dict(params))
+```
+
+deploy_lib.so, deploy_graph.json, and deploy_param.params will go to the Android target.
+
+## TVM Runtime for Android Target
+
+Refer to [this guide](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/README.md#build-and-installation) to build the CPU/OpenCL flavor of the TVM runtime for the Android target.
+Refer to this [Java](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java) sample source for how to load the model and execute it from the Android Java TVM API.
diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
new file mode 100644
index 000000000000..bd0dae97879d
--- /dev/null
+++ b/docs/deploy/aocl_fpga.md
@@ -0,0 +1,92 @@
+AOCL Backend Example
+====================
+
+TVM supports Intel FPGA SDK for OpenCL also known as AOCL. Here is a tutorial for how to use TVM with AOCL.
+
+***Note***: This feature is still experimental. We cannot use AOCL to deploy an end-to-end neural network for now. In addition, we have only tested compilation for the emulation mode of AOCL.
+
+We use two python scripts for this tutorial.
+
+- build.py - a script to synthesize FPGA bitstream.
+```python
+import tvm
+
+tgt_host="llvm"
+tgt="aocl -device=s5_ref -mattr=emulator"
+
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+s = tvm.create_schedule(C.op)
+px, x = s[C].split(C.op.axis[0], nparts=1)
+
+s[C].bind(px, tvm.thread_axis("pipeline"))
+
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+fadd.save("myadd.o")
+fadd.imported_modules[0].save("myadd.aocx")
+
+tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"])
+```
+
+- run.py - a script to use FPGA as an accelerator.
+```python
+import tvm
+import numpy as np
+import os
+
+tgt="aocl -device=s5_ref -mattr=emulator"
+
+fadd = tvm.module.load("myadd.so")
+fadd_dev = tvm.module.load("myadd.aocx")
+fadd.import_module(fadd_dev)
+
+ctx = tvm.context(tgt, 0)
+
+n = 1024
+a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
+
+fadd(a, b, c)
+np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+```
+
+Setup
+-----
+
+- Install AOCL 17.1 on Ubuntu 16.04.4 LTS.
+- Install BSP for your FPGA device.
+- Install FPGA device driver.
+- Create an ICD file at /etc/OpenCL/vendors/Altera.icd so that the OpenCL platform can be found.
+```
+/opt/intelFPGA/17.1/hld/linux64/lib/libalteracl.so
+```
+- Create an FCD file for example at /opt/Intel/OpenCL/Boards/s5_ref.fcd so that your FPGA device can be found.
+```
+/opt/intelFPGA/17.1/hld/board/s5_ref/linux64/lib/libaltera_s5_ref_mmd.so
+```
+- Setup TVM with AOCL and OpenCL enabled.
+
+Emulation
+---------
+
+- Run software emulation
+```
+export CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1
+
+python build.py
+python run.py
+```
+
+- Run on FPGA devices (not tested)
+ - Change the tgt value to "aocl -device=s5_ref" in build.py and run.py
+```
+unset CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA
+
+python build.py
+python run.py
+```
diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
new file mode 100644
index 000000000000..7554ce7f64cd
--- /dev/null
+++ b/docs/deploy/aws_fpga.md
@@ -0,0 +1,152 @@
+HLS Backend Example
+===================
+
+TVM supports Xilinx FPGA board with SDAccel. Here is a tutorial for how to deploy TVM to AWS F1 FPGA instance.
+
+***Note***: This feature is still experimental. We cannot use SDAccel to deploy an end-to-end neural network for now.
+
+We use two python scripts for this tutorial.
+
+- build.py - a script to synthesize FPGA bitstream.
+```python
+import tvm
+
+tgt_host="llvm"
+tgt="sdaccel"
+
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+s = tvm.create_schedule(C.op)
+px, x = s[C].split(C.op.axis[0], nparts=1)
+
+s[C].bind(px, tvm.thread_axis("pipeline"))
+
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+fadd.save("myadd.o")
+fadd.imported_modules[0].save("myadd.xclbin")
+
+tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"])
+```
+
+- run.py - a script to use FPGA as an accelerator.
+```python
+import tvm
+import numpy as np
+import os
+
+tgt="sdaccel"
+
+fadd = tvm.module.load("myadd.so")
+if os.environ.get("XCL_EMULATION_MODE"):
+ fadd_dev = tvm.module.load("myadd.xclbin")
+else:
+ fadd_dev = tvm.module.load("myadd.awsxclbin")
+fadd.import_module(fadd_dev)
+
+ctx = tvm.context(tgt, 0)
+
+n = 1024
+a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
+c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
+
+fadd(a, b, c)
+np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+```
+
+Setup
+-----
+
+- Launch an instance using the FPGA Developer AMI. We don't need an F1 instance for emulation and synthesis, so it is recommended to use a lower cost instance for them.
+
+- Setup AWS FPGA development kit.
+```bash
+git clone https://github.com/aws/aws-fpga.git
+cd aws-fpga
+source sdaccel_setup.sh
+source ${XILINX_SDX}/settings64.sh
+```
+
+- Setup TVM with OpenCL enabled.
+
+Emulation
+---------
+
+- Create emconfig.json for emulation.
+```bash
+emconfigutil --platform ${AWS_PLATFORM} --nd 1
+```
+
+- Copy emconfig.json to the python binary directory. This is because the current Xilinx toolkit assumes that the host binary and the emconfig.json file are in the same path.
+```bash
+cp emconfig.json $(dirname $(which python))
+```
+
+- Run software emulation
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=sw_emu
+
+python build.py
+python run.py
+```
+
+- Run hardware emulation
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=hw_emu
+
+python build.py
+python run.py
+```
+
+
+Synthesis
+---------
+
+- Run synthesis with the following script. `XCL_EMULATION_MODE` must be set to 1 at this stage.
+
+```bash
+export XCL_EMULATION_MODE=1
+export XCL_TARGET=hw
+
+python build.py
+```
+
+- Create AWS FPGA image and upload it to AWS S3.
+```bash
+${SDACCEL_DIR}/tools/create_sdaccel_afi.sh -xclbin=myadd.xclbin -o=myadd \
+ -s3_bucket=<bucket-name> -s3_dcp_key=<dcp-folder-name> -s3_logs_key=<logs-folder-name>
+```
+This also generates an awsxclbin file, which is necessary to use the AWS FPGA image on F1 instances.
+
+Run
+---
+
+- Launch Amazon EC2 F1 instance.
+
+- Copy `myadd.so`, `myadd.awsxclbin`, and `run.py` to the F1 instance.
+
+- Setup AWS FPGA development kit.
+```bash
+git clone https://github.com/aws/aws-fpga.git
+cd aws-fpga
+source sdaccel_setup.sh
+```
+
+- Setup TVM with OpenCL enabled.
+
+- Become root and setup environment variables.
+```bash
+sudo sh
+source ${INSTALL_ROOT}/setup.sh
+```
+
+- Run
+```bash
+python run.py
+```
diff --git a/docs/how_to/deploy.md b/docs/deploy/cpp_deploy.md
similarity index 93%
rename from docs/how_to/deploy.md
rename to docs/deploy/cpp_deploy.md
index b9f219acc335..d02d33d18694 100644
--- a/docs/how_to/deploy.md
+++ b/docs/deploy/cpp_deploy.md
@@ -1,5 +1,6 @@
-How to Deploy TVM Modules
-=========================
+Deploy TVM Module using C++ API
+===============================
+
We provide an example on how to deploy TVM modules in [apps/howto_deploy](https://github.com/dmlc/tvm/tree/master/apps/howto_deploy)
To run the example, you can use the following command
@@ -12,8 +13,6 @@ cd apps/howto_deploy
Get TVM Runtime Library
-----------------------
-![](http://www.tvmlang.org/images/release/tvm_flexible.png)
-
The only thing we need is to link to a TVM runtime in your target platform.
TVM provides a minimum runtime, which costs around 300K to 600K depending on how many modules we use.
In most cases, we can use ```libtvm_runtime.so``` that comes with the build.
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
new file mode 100644
index 000000000000..0ef5cf5c8246
--- /dev/null
+++ b/docs/deploy/index.rst
@@ -0,0 +1,54 @@
+.. _deploy-and-integration:
+
+Deploy and Integration
+======================
+
+This page contains guidelines on how to deploy TVM to various platforms
+as well as how to integrate it with your project.
+
+.. image:: http://www.tvm.ai/images/release/tvm_flexible.png
+
+Unlike traditional deep learning frameworks, the TVM stack is divided into two major components:
+
+- TVM compiler, which does all the compilation and optimizations
+- TVM runtime, which runs on the target devices.
+
+In order to integrate the compiled module, we **do not** need to build the entire TVM stack on the target device. We only need to build the TVM compiler stack on a desktop and use it to cross-compile modules that are deployed on the target device.
+We only need to use a light-weight runtime API that can be integrated into various platforms.
+
+For example, you can run the following commands to build the runtime API
+on a Linux based embedded system such as Raspberry Pi:
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+ cd tvm
+ mkdir build
+ cp cmake/config.cmake build
+ cd build
+ cmake ..
+ make runtime
+
+Note that we type `make runtime` to only build the runtime library.
+If you want to include additional runtime such as OpenCL,
+you can modify `config.cmake` to enable these options.
+After you get the TVM runtime library, you can link the compiled library into your project.
+
+The easiest and recommended way to test, tune and benchmark TVM kernels on
+embedded devices is through TVM's RPC API.
+Here are the links to the related tutorials.
+
+- :ref:`tutorial-cross-compilation-and-rpc`
+- :ref:`tutorial-deploy-model-on-mali-gpu`
+- :ref:`tutorial-deploy-model-on-rasp`
+
+After you have finished tuning and benchmarking, you might need to deploy the model on the
+target device without relying on RPC. See the following resources on how to do so.
+
+.. toctree::
+ :maxdepth: 2
+
+ cpp_deploy
+ android
+ nnvm
+ integrate
diff --git a/docs/deploy/integrate.md b/docs/deploy/integrate.md
new file mode 100644
index 000000000000..b6f3b1faa3da
--- /dev/null
+++ b/docs/deploy/integrate.md
@@ -0,0 +1,50 @@
+Integrate TVM into Your Project
+===============================
+
+TVM's runtime is designed to be lightweight and portable.
+There are several ways you can integrate TVM into your project.
+
+This article introduces possible ways to integrate TVM
+as a JIT compiler to generate functions on your system.
+
+
+## DLPack Support
+
+TVM's generated function follows the PackedFunc convention.
+It is a function that can take positional arguments of
+standard types such as float, integer, and string.
+The PackedFunc takes a DLTensor pointer following the [dlpack](https://github.com/dmlc/dlpack) convention.
+The only thing you need to solve is how to create a corresponding DLTensor object.
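+
+As a hedged python-side illustration (`myadd.so` is a placeholder for any library
+built by `tvm.build`), `tvm.nd.array` already wraps a numpy array into a
+DLTensor-backed array that a PackedFunc accepts directly:
+
+```python
+import numpy as np
+import tvm
+
+fadd = tvm.module.load("myadd.so")  # placeholder module built by tvm.build
+a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"))
+b = tvm.nd.array(np.random.uniform(size=1024).astype("float32"))
+c = tvm.nd.array(np.zeros(1024, dtype="float32"))
+fadd(a, b, c)  # positional, type-erased call following the PackedFunc convention
+```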
+
+
+
+## Integrate User Defined C++ Array
+
+The only thing we have to do in C++ is to convert your array to a DLTensor and pass its address as
+```DLTensor*``` to the generated function.
+
+
+## Integrate User Defined Python Array
+
+Assume you have a python object ```MyArray```. There are three things you need to do:
+
+- Add a ```_tvm_tcode``` field to your array which returns ```tvm.TypeCode.ARRAY_HANDLE```
+- Support the ```_tvm_handle``` property in your object, which returns the address of the DLTensor as a python integer
+- Register this class by ```tvm.register_extension```
+
+```python
+# Example code
+import tvm
+
+class MyArray(object):
+ _tvm_tcode = tvm.TypeCode.ARRAY_HANDLE
+
+ @property
+ def _tvm_handle(self):
+ dltensor_addr = self.get_dltensor_addr()
+ return dltensor_addr
+
+# You can put the registration step in a separate file mypkg.tvm.py
+# and only import it if you want tvm to be an optional dependency.
+tvm.register_extension(MyArray)
+```
diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
new file mode 100644
index 000000000000..aa6c39facd1f
--- /dev/null
+++ b/docs/deploy/nnvm.md
@@ -0,0 +1,118 @@
+# Deploy NNVM Modules
+NNVM compiled modules are fully embedded in the TVM runtime as long as the ```GRAPH_RUNTIME``` option
+is enabled in the tvm runtime.
+
+
+In a nutshell, we will need three items to deploy a compiled module.
+Check out our tutorials on getting started with the NNVM compiler for more details.
+
+- The graph json data which contains the execution graph.
+- The tvm module library of compiled functions.
+- The parameter blobs for stored parameters.
+
+We can then use TVM's runtime API to deploy the compiled module.
+Here is an example in python.
+
+```python
+import tvm
+
+# tvm module for compiled functions.
+loaded_lib = tvm.module.load("deploy.so")
+# json graph
+loaded_json = open(temp.relpath("deploy.json")).read()
+# parameters in binary
+loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
+
+fcreate = tvm.get_global_func("tvm.graph_runtime.create")
+ctx = tvm.gpu(0)
+gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
+set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
+set_input("x", tvm.nd.array(x_np))
+gmodule["load_params"](loaded_params)
+run()
+out = tvm.nd.empty(shape)
+get_output(0, out)
+print(out.asnumpy())
+```
+
+An example in C++.
+```cpp
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <iostream>
+#include <fstream>
+#include <iterator>
+#include <algorithm>
+
+int main()
+{
+ // tvm module for compiled functions
+ tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile("deploy.so");
+
+ // json graph
+ std::ifstream json_in("deploy.json", std::ios::in);
+ std::string json_data((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
+ json_in.close();
+
+ // parameters in binary
+ std::ifstream params_in("deploy.params", std::ios::binary);
+ std::string params_data((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
+ params_in.close();
+
+ // parameters need to be TVMByteArray type to indicate the binary data
+ TVMByteArray params_arr;
+ params_arr.data = params_data.c_str();
+ params_arr.size = params_data.length();
+
+ int dtype_code = kDLFloat;
+ int dtype_bits = 32;
+ int dtype_lanes = 1;
+ int device_type = kDLCPU;
+ int device_id = 0;
+
+ // get global function module for graph runtime
+ tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(json_data, mod_syslib, device_type, device_id);
+
+ DLTensor* x;
+ int in_ndim = 4;
+ int64_t in_shape[4] = {1, 3, 224, 224};
+ TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
+ // load image data saved in binary
+ std::ifstream data_fin("cat.bin", std::ios::binary);
+ data_fin.read(static_cast<char*>(x->data), 3 * 224 * 224 * 4);
+
+ // get the function from the module(set input data)
+ tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
+ set_input("data", x);
+
+ // get the function from the module (load parameters)
+ tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
+ load_params(params_arr);
+
+ // get the function from the module(run it)
+ tvm::runtime::PackedFunc run = mod.GetFunction("run");
+ run();
+
+ DLTensor* y;
+ int out_ndim = 1;
+ int64_t out_shape[1] = {1000, };
+ TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
+
+ // get the function from the module(get output data)
+ tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
+ get_output(0, y);
+
+ // get the maximum position in output vector
+ auto y_iter = static_cast<float*>(y->data);
+ auto max_iter = std::max_element(y_iter, y_iter + 1000);
+ auto max_index = std::distance(y_iter, max_iter);
+ std::cout << "The maximum position in output vector is: " << max_index << std::endl;
+
+ TVMArrayFree(x);
+ TVMArrayFree(y);
+
+ return 0;
+}
+```
diff --git a/docs/dev/hybrid_script.rst b/docs/dev/hybrid_script.rst
new file mode 100644
index 000000000000..0af02a56e72c
--- /dev/null
+++ b/docs/dev/hybrid_script.rst
@@ -0,0 +1,76 @@
+Hybrid Frontend Developer Guide
+===============================
+
+If you are a developer:
+
+1. who is trying to write some preliminary patterns that are not supported by TVM yet,
+maybe :ref:`hybrid-langref-label` is a better place for you.
+
+2. who wants to know the implementation details of this module, you are right here!
+
+Features
+--------
+
+Software emulation
+~~~~~~~~~~~~~~~~~~
+
+In software emulation, the most interesting thing is the decorator ``tvm.hybrid.script``.
+This decorator does 2 things:
+
+1. Importing runtime variables
+
+2. Overloading the function according to the arguments passed
+
+Correct me if I am wrong: I believe the way 1. is implemented is dangerous, but I have no
+choice. What I do is add those names into the python dict ``func.__globals__``, and after
+the call to ``func`` is done, those names are cleaned up.
+
+Overloading is simple: the decorator checks the argument types and determines which function
+should actually be called, as sketched below.
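+
+A minimal sketch of the idea, not the actual implementation (``_to_ir`` below is a
+hypothetical stand-in for the parsing entry point):
+
+.. code:: python
+
+    import numpy
+
+    def _to_ir(func, args):
+        # hypothetical placeholder for the AST-parsing + IR-generation path
+        raise NotImplementedError("sketch only")
+
+    def script(func):
+        def wrapper(*args):
+            # Concrete numpy inputs: run the python body directly
+            # (software emulation).
+            if all(isinstance(a, (int, float, numpy.ndarray)) for a in args):
+                return func(*args)
+            # Symbolic tensor inputs: parse the function and emit HalideIR.
+            return _to_ir(func, args)
+        return wrapper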
+
+
+Backend Compilation
+~~~~~~~~~~~~~~~~~~~
+
+Compilation is a large module; you can see ``python/tvm/hybrid/var_decl.py`` and
+``python/tvm/hybrid/parser.py`` for more details. The first stage determines the
+usage, or more accurately the declaration, of each variable, and the second stage does
+the actual IR generation.
+
+Attributes
+~~~~~~~~~~
+
+So far, ONLY the tensors' `shape` attribute is supported. You can see ``visit_Subscript``
+in ``python/tvm/hybrid/parser.py`` for more details. This is a hacky solution; I just
+check the attributes when visiting a subscript.
+
+Loops
+~~~~~
+
+In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``.
+
+
+.. note::
+
+ Unlike in HalideIR, in ``loop_type(a, b)``, ``a`` is the starting point and ``b``
+ is the trip count of iterations. Here ``loop_type(a, b)`` indicates ``[a, b)``. Thus, when lowering it
+ to HalideIR, we need to do ``start, extent = a, b - a``.
+
+
+.. note::
+
+ In HalideIR those are enums and they are in passive form.
+ Here we use the active form to annotate loops, because they are ready to run.
+
+
+Variables
+~~~~~~~~~
+
+Because there are no variables in ``HalideIR``, all the mutable variables are lowered to an array of size 1.
+The first store of a variable is taken as its declaration, as illustrated below.
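+
+For example (illustrative only), a body such as
+
+.. code:: python
+
+    s = 0.0              # first store: treated as the declaration of s
+    for i in range(10):
+        s = s + a[i]     # later stores mutate the same variable
+
+is conceptually lowered so that ``s`` becomes a one-element buffer: the
+declaration becomes an allocation of size 1, and every read or write of ``s``
+becomes ``s[0]``.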
+
+Math intrinsics
+~~~~~~~~~~~~~~~
+So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, and ``popcount``, are supported.
+Math intrinsics are imported by the decorator. Most of the intrinsics are borrowed from the library implementation,
+except ``popcount`` and ``sigmoid``, which I implemented manually.
diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index 0d0ee852f6f8..f3ab322bfe53 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -1,11 +1,13 @@
-TVM Design and Developer Guide
-==============================
+Design and Developer Guide
+==========================
-Building an IR stack for deep learning systems involves many
-many systems-level design decisions.
+Building a compiler stack for deep learning systems involves many systems-level design decisions.
In this part of documentation, we share the rationale for the specific choices made when designing TVM.
.. toctree::
:maxdepth: 2
runtime
+ nnvm_json_spec
+ nnvm_overview
+ hybrid_script
diff --git a/docs/dev/nnvm_json_spec.rst b/docs/dev/nnvm_json_spec.rst
new file mode 100644
index 000000000000..31f2d2dc5c07
--- /dev/null
+++ b/docs/dev/nnvm_json_spec.rst
@@ -0,0 +1,212 @@
+NNVM Graph JSON Specification
+=============================
+
+NNVM uses JSON for graph serialization. This allows an NNVM graph to be
+exported to any backend, either natively supported or third-party,
+without any dependency such as protobuf.
+
+Getting started
+---------------
+
+A serialized NNVM graph in JSON format can be deserialized by any JSON
+parser.
+
+.. code:: python
+
+ # python
+ import json
+ with open('model.json', 'r') as f:
+ graph = json.loads(f.read())
+ print(graph.keys())
+
+``['nodes', 'arg_nodes', 'heads', 'node_row_ptr']``
+
+The following keys are valid in a JSON graph.
+
++--------------------------------------+------------+-----------------------------------+
+| Keys | Required | Description |
++======================================+============+===================================+
+| `nodes <#nodes>`__ | Yes | The nodes in graph. |
++--------------------------------------+------------+-----------------------------------+
+| `arg\_nodes <#arg_nodes>`__ | Yes | Indices of input nodes. |
++--------------------------------------+------------+-----------------------------------+
+| `heads <#heads>`__ | Yes | Indices of output nodes. |
++--------------------------------------+------------+-----------------------------------+
+| `node\_row\_ptr <#node_row_ptr>`__ | Optional | Depth first search row indices. |
++--------------------------------------+------------+-----------------------------------+
+| `attr <#attr>`__ | Optional | Additional information. |
++--------------------------------------+------------+-----------------------------------+
+
+nodes
+-----
+
+As the name suggests, ``nodes`` are either placeholders or
+computational nodes in the NNVM graph. The ``nodes`` are stored in a list.
+
+.. code:: python
+
+ nodes = graph['nodes']
+ print(len(nodes))
+ print(nodes[0])
+ print(nodes[3])
+
+::
+
+ 53
+ {'inputs': [], 'name': 'data', 'op': 'null'}
+ {'inputs': [[0, 0, 0], [1, 0, 0], [2, 0, 0]], 'attrs': {'channels': '64',
+ 'padding': '(1, 1)', 'layout': 'NCHW', 'kernel_size': '[3, 3]', 'groups': '1',
+ 'strides': '(1, 1)', 'use_bias': 'True', 'dilation': '(1, 1)'},
+ 'name': 'conv1_1', 'op': 'conv2d'}
+
+The following keys are valid in each node:
+
++----------------+------------------+----------+
+| Keys | Required | Descript |
+| | | ion |
++================+==================+==========+
+| op | Yes | The |
+| | | operator |
+| | | type |
+| | | name, |
+| | | 'null' |
+| | | is used |
+| | | if it's |
+| | | a |
+| | | placehol |
+| | | der/vari |
+| | | able/inp |
+| | | ut. |
++----------------+------------------+----------+
+| name | Yes | The |
+| | | given |
+| | | name of |
+| | | the |
+| | | node, |
+| | | defined |
+| | | by user |
+| | | composin |
+| | | g |
+| | | the |
+| | | network. |
++----------------+------------------+----------+
+| inputs | Yes | List of |
+| | | Entry |
+| | | of the |
+| | | input |
+| | | nodes, |
+| | | can be |
+| | | empty |
+| | | list []. |
+| | | Entry is |
+| | | a list |
+| | | of |
+| | | [nose\_i |
+| | | d, |
+| | | index, |
+| | | version] |
++----------------+------------------+----------+
+| attrs | Optional | Extra |
+| | | attribut |
+| | | es |
+| | | for the |
+| | | specific |
+| | | operator |
+| | | . |
++----------------+------------------+----------+
+| control\_deps | Optional | Control |
+| | | dependen |
+| | | cies, |
+| | | left |
+| | | blank |
+| | | unless |
+| | | specific |
+| | | ally |
+| | | used. |
++----------------+------------------+----------+
+
+``attrs`` for operators is a dictionary. Key-value pair examples:
+
++----------------+------------------+----------+----------+
+| Keys | Value | Operator | Descript |
+| | | | ion |
++================+==================+==========+==========+
+| 'channels' | '64' | conv2d | Output |
+| | | | channels |
+| | | | for 2d |
+| | | | convolut |
+| | | | ion. |
++----------------+------------------+----------+----------+
+| 'kernel\_size' | '[3, 3]' | conv2d | Convolut |
+| | | | ion |
+| | | | filter |
+| | | | kernel |
+| | | | size in |
+| | | | (h, w), |
+| | | | list and |
+| | | | tuple |
+| | | | both |
+| | | | works. |
++----------------+------------------+----------+----------+
+| 'use\_bias' | '1' | conv2d | Whether |
+| | | | use bias |
+| | | | such |
+| | | | that |
+| | | | `y = w |
+| | | | * x + b` |
+| | | | . |
++----------------+------------------+----------+----------+
+
+.. note::
+
+ Tips for parsing key-value pairs:
+
+ * Both key and value are stored as strings.
+
+ * Boolean values need extra attention; converting to int is recommended since `bool('0') == True` in python.
+
+ * For a full list of operator attributes, please refer to the core operator documentation.
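+
+For example, a small helper along these lines (a sketch, not part of NNVM)
+makes the string-typed ``attrs`` usable:
+
+.. code:: python
+
+    import ast
+
+    def parse_attr(value):
+        """Parse a string-typed attribute into a python value."""
+        try:
+            # '64' -> 64, '(1, 1)' -> (1, 1), 'True' -> True
+            return ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            # layout strings such as 'NCHW' stay as plain strings
+            return value
+
+    conv_attrs = {k: parse_attr(v) for k, v in nodes[3]['attrs'].items()}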
+
+arg\_nodes
+----------
+
+``arg_nodes`` is a list of indices of nodes which are
+placeholders/variables/inputs to the graph.
+
+.. code:: python
+
+ print(graph['arg_nodes'])
+
+::
+
+ [0, 1, 2, 6, 7, 11, 12, 15, 16, 20, 21, 24, 25, 29, 30, 33, 34, 39, 40, 44, 45, 49, 50]
+
+For example, ``nodes[3]`` is not in ``arg_nodes`` because it's an
+internal node.
+
+heads
+-----
+
+``heads`` is a list of entries as the outlet/output of the graph.
+
+.. code:: python
+
+ print(graph['heads'])
+
+::
+
+ [[52, 0, 0]]
+
+This example indicates that there is only one output in the graph, coming from
+the node with index 52.
+
+node\_row\_ptr
+--------------
+
+``node_row_ptr`` stores the history of the forward path, so you can skip
+constructing the entire graph in inference tasks.
+
+attrs
+-----
+
+``attrs`` can contain version numbers or similar helpful information.
diff --git a/docs/dev/nnvm_overview.md b/docs/dev/nnvm_overview.md
new file mode 100644
index 000000000000..4f01fdda2a03
--- /dev/null
+++ b/docs/dev/nnvm_overview.md
@@ -0,0 +1,126 @@
+
+# NNVM Design Overview
+
+NNVM is a reusable graph IR stack for deep learning systems. It provides useful APIs to construct, represent and transform computation graphs, to obtain most of the high-level optimizations needed in deep learning.
+As a part of the TVM stack for deep learning, NNVM also provides a shared compiler for deep learning frameworks to optimize, compile and deploy onto different hardware backends via [TVM](https://github.com/dmlc/tvm).
+
+## Key Requirements and Design Choices
+
+- Have minimum dependency in the deployment module.
+- Being able to add new operators to the IR, in a decentralized fashion.
+- Being able to add new optimization passes to the IR and apply them to existing graphs.
+
+Items 2 and 3 are particularly interesting if we compare them to a typical compiler IR. A compiler IR usually contains a fixed set of primitives (instructions) and uses them as a contract between optimization pass designers. This design enables easy addition of new optimization passes, but not of new operators (instructions), because every time we add a new instruction, we need to modify the passes to accommodate the change.
+
+Deep learning frameworks usually have a fixed operator interface (schema). These interfaces can contain properties like the shape inference function, or whether in-place computation can happen. The operator interface is again a contract that makes it easy to add new operators. But it is hard to add new passes in a decentralized fashion: a new optimization pass usually requires additional information, and this results in frequent changes of the centralized operator interface when we are exploring new optimizations. There is also a drawback for modularization. For example, a graph compiler for FPGA devices may not need the GPU device specific attributes.
+
+During our explorations in graph optimization and compilation, we find that it is important to quickly add both operators and passes to the framework without changing the core library.
+
+Here is a list of key elements in NNVM's design
+
+- An operator registry system to register and add new operators
+- An operator attribute system to provide properties of operators in a decentralized fashion
+- A reusable IR data structure for optimization passes.
+
+The above list is the generic language part of NNVM. Besides that, we also provide a collection of core operator primitives and graph optimization passes. The core tensor operator primitives and optimizations already cover common deep learning workloads. This design allows the NNVM compiler to be directly used as an optimization and compilation stack for frameworks. The extensible nature of NNVM makes new adjustments easy without constraining the backend providers.
+
+## Minimum Registration for a Symbolic Front-End
+To use NNVM to build a language front-end, a developer only needs to register minimum information about each operator.
+
+```c++
+NNVM_REGISTER_OP(add)
+.describe("add two data together")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(conv2d)
+.describe("take 2d convolution of input")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(assign)
+.describe("assign second input argument to the first one")
+.set_num_inputs(2);
+```
+
+After compiling the code with the NNVM library, users can compose the computation graph in python with the following interface.
+
+```python
+import nnvm.symbol as nn
+
+# symbolic variable
+x = nn.Variable('x')
+y = nn.Variable('y')
+w = nn.Variable('w')
+
+z = nn.conv2d(nn.elemwise_add(x, y), w, kernel_size=(2,2), name='conv1')
+```
+
+The graph structure is interchangeable between the frontend and the backend. The Python interface is supported currently. More language support can easily be added in the future.
+
+## Operator Attribute for More Extensions
+
+The minimum information provided by the operator is enough to get a front-end. However, we need more knowledge about each operator to do transformations and execute the graph.
+A typical difference between a neural network's computation graph and a traditional compiler IR is that there are a lot more high-level operators. We cannot fix the set of operators in the IR.
+
+NNVM allows developers to register attributes of each operator. The attributes can include the shape inference function, whether the operator can perform in-place calculation, etc.
+
+This design of having an operator attribute registry is not uncommon in deep learning systems.
+For example, MXNet has an ```OpProperty``` class, TensorFlow has ```OpDef``` and Caffe2 has an ```OperatorSchema``` class.
+However, the operator attribute interfaces listed in these frameworks only support a fixed number of defined attributes of interest to the system. If we want to extend the framework to add a new attribute to each operator, we need to change the operator registry.
+Eventually, the operator interface grows to be very big and has to evolve in the centralized repo.
+
+In NNVM, we decided to change the design and support arbitrary types of operator attributes, without changing the registry interface. The minimum interface also makes it easier to share across multiple projects.
+
+Users can register a new attribute, such as an in-place property checking function, as follows.
+```c++
+using FInplaceOption = std::function<
+ std::vector<std::pair<int, int> > (const NodeAttrs& attrs)>;
+
+// we can register attributes from multiple places.
+NNVM_REGISTER_OP(elemwise_add)
+.set_num_inputs(2);
+
+// register to tell that the first input can be computed in place with the first output
+NNVM_REGISTER_OP(add)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+ return std::vector<std::pair<int, int> >{{0, 0}};
+ });
+
+NNVM_REGISTER_OP(exp)
+.set_num_inputs(1)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+ return std::vector<std::pair<int, int> >{{0, 0}};
+ });
+```
+
+We can query these attributes from arbitrary parts of the code, as in the following example. Under the hood, each attribute is stored in a columnar store from which it can easily be retrieved for quick lookups.
+
+```c++
+void MyFunction() {
+ const Op* add = Op::Get("add");
+ // if we need quick query, we can use static variable
+ // attribute map contains attributes of all operators.
+ static auto& finplace_option_map = Op::GetAttr<FInplaceOption>("FInplaceOption");
+
+ // quick look up attribute of add, O(1) time, vector index lookup internally.
+ auto add_inplace = finplace_option_map[add];
+}
+```
+Besides keeping the core minimal, this attribute store enables decentralization of projects.
+Before, all the attributes of an operator had to sit on a centralized interface class.
+Now, everyone can register their own attributes and take the attributes they need from other projects, without changing the operator interface and core library.
+
+
+## Graph and Pass
+
+We can use the additional information in the attribute registry to do optimizations and get more information about the graph. The Graph is the unit we manipulate in these steps. A Graph in NNVM contains
+two parts:
+- The computation graph structure
+- An attribute map from string to any type: ```map<std::string, std::shared_ptr<any> >```
+
+The attribute map is quite important, as we may need different kinds
+of information about the graph during the transformation process, be it
+the shapes of each tensor, the types of each tensor or the storage allocation plans.
+
+A ```Pass``` can take a graph with existing attribute information
+and transform it into the same graph structure with more graph attributes, or into another graph, as in the sketch below.
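+
+As a hedged python-side illustration (using the `nnvm.graph` API from this repo;
+which attributes are present depends on which passes have run):
+
+```python
+import nnvm.symbol as sym
+import nnvm.graph as graph_mod
+
+x = sym.Variable('x')
+z = sym.exp(sym.elemwise_add(x, x))
+g = graph_mod.create(z)
+
+# Apply a pass by name; the result is a new graph that carries an extra
+# attribute column (here "shape", filled in by InferShape).
+g2 = g.apply('InferShape')
+print(g2.json_attr('shape'))
+```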
diff --git a/docs/dev/runtime.md b/docs/dev/runtime.md
index b9cc81186200..a5d8138c3372 100644
--- a/docs/dev/runtime.md
+++ b/docs/dev/runtime.md
@@ -1,18 +1,18 @@
# TVM Runtime System
-TVM support multiple programming languages for compiler stack development and deployment.
-In this note, we explain the key element of TVM runtime.
+TVM supports multiple programming languages for the compiler stack development and deployment.
+In this note, we explain the key elements of the TVM runtime.
-![](http://www.tvmlang.org/images/release/tvm_flexible.png)
+![](http://www.tvm.ai/images/release/tvm_flexible.png)
We need to satisfy quite a few interesting requirements
- Deployment: invoke the compiled function from python/javascript/c++ language.
- Debug: define a function in python and call that from a compiled function.
-- Link: write driver code to call device specific code(CUDA) and call it from compiled host function.
+- Link: write driver code to call device specific code (CUDA) and call it from compiled host function.
- Prototype: define an IR pass from python and call that from C++ backend.
-- Expose: compiler stack developed in c++ to front-end (i.e, python)
-- Experiment: ship a compiled function to an embedded device directly run there.
+- Expose: compiler stack developed in c++ to front-end (i.e, python)
+- Experiment: ship a compiled function to an embedded device to directly run there.
We want to be able to define a function from any language and call from another.
We also want the runtime core to be minimal to deploy to embedded devices.
@@ -41,11 +41,11 @@ void CallPacked() {
```
In the above codeblock, we defined a PackedFunc MyAdd. It takes two arguments
: ```args``` represents input arguments and ```rv``` represents return value.
-The function is type-erased, which means the function signature does not restrict which input type to pass in or type to return.
+The function is type-erased, which means that the function signature does not restrict which input type to pass in or type to return.
Under the hood, when we call a PackedFunc, it packs the input arguments to TVMArgs on stack,
-and get the result back via TVMRetValue.
+and gets the result back via TVMRetValue.
-Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created.
+Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created.
The following example registers PackedFunc in C++ and calls from python.
```c++
@@ -74,7 +74,7 @@ The restriction makes the implementation simple without the need of serializatio
Despite being minimum, the PackedFunc is sufficient for the use-case of deep learning deployment as
most functions only take DLTensor or numbers.
-Since one PackedFunc can take another PackedFunc as argument,
+Since one PackedFunc can take another PackedFunc as an argument,
we can pass functions from python(as PackedFunc) to C++.
```c++
TVM_REGISTER_GLOBAL("callhello")
@@ -97,15 +97,15 @@ callhello(f)
```
TVM provides a [minimum C API](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h),
-that allows us to embedded the PackedFunc into any languages. Besides python, so far we supported
+which allows us to embed the PackedFunc into any languages. Besides python, so far we supported
[java](https://github.com/dmlc/tvm/tree/master/jvm) and [javascript](https://github.com/dmlc/tvm/tree/master/web).
-This philosophy of embedded API is very like Lua, except that we don't have a new language and uses C++.
+This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++.
One fun fact about PackedFunc is that we use it for both compiler and deployment stack.
- All TVM's compiler pass functions are exposed to frontend as PackedFunc, see [here](https://github.com/dmlc/tvm/tree/master/src/api)
-- The compiled modules also returns compiled function as PackedFunc
+- The compiled module also returns the compiled function as PackedFunc
-To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules(e.g., CUDA) get included.
+To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included.
The overhead of calling into PackedFunc vs. a normal function is small, as it is only saving a few values on the stack.
So it is OK as long as we don't wrap small functions.
@@ -113,9 +113,9 @@ In summary, the PackedFunc is the universal glue in TVM where we use it extensiv
## Module
-Since TVM support multiple types of devices, we need to support different type of drivers.
-We have to use driver API to load the kernel, set up the argument in packed format and perform kernel launch.
-We also need to patch up the driver API so that the exposed functions is threadsafe.
+Since TVM supports multiple types of devices, we need to support different type of drivers.
+We have to use the driver API to load the kernel, set up the argument in packed format and perform kernel launch.
+We also need to patch up the driver API so that the exposed functions are threadsafe.
So we often need to implement these driver glues in C++ and expose them to the user.
We can certainly not do it for each type of functions, so again PackedFunc is our answer.
@@ -130,32 +130,32 @@ of new device easy, and we do not need to redo the host code generation for each
## Remote Deployment
The PackedFunc and Module system also makes it easy to ship the function into remote devices directly.
-Under the hood, we have a RPCModule that serializes the arguments and do the data movement and launches the computation on the remote.
+Under the hood, we have an RPCModule that serializes the arguments to do the data movement and launches the computation on the remote.
-![](http://www.tvmlang.org/images/release/tvm_rpc.png)
+![](http://www.tvm.ai/images/release/tvm_rpc.png)
The RPC server itself is minimum and can be bundled into the runtime. We can start a minimum TVM
-RPC server on iPhone/android/raspberry pi or even your browser. The cross compilation on server and shipping of the module for testing can be done in the same script. Checkout
-[Cross compilation and RPC tutorial](http://docs.tvmlang.org/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details.
+RPC server on iPhone/android/raspberry pi or even the browser. The cross compilation on the server and the shipping of the module for testing can be done in the same script. Check out
+[Cross compilation and RPC tutorial](http://docs.tvm.ai/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details.
-This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone copy the result back and do verification on my host via numpy. We can also do the profiling using the same script.
+This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test cases in swift/objective-c from scratch -- we can use RPC to execute on the iPhone, copy the result back and verify on the host via numpy. We can also do the profiling using the same script.
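+
+As a rough sketch of that flow (the address, port, and file name below are placeholders, and a TVM RPC server is assumed to be running on the device):
+
+```python
+import tvm
+from tvm.contrib import rpc
+
+remote = rpc.connect("192.168.0.10", 9090)  # connect to the device's RPC server
+remote.upload("mylib.so")                   # ship the cross-compiled module
+f = remote.load_module("mylib.so")          # load it on the remote device
+# The functions of f are PackedFunc: calling them serializes the arguments,
+# runs the kernel remotely, and copies the results back to the host.
+```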
## TVM Node and Compiler Stack
As we mentioned earlier, we build the compiler stack API on top of the PackedFunc runtime system.
-We faced a constant changing the compiler API for the need of research. We need a new language object or IR node from now and then when we want to test out new primitives.
+We face constant changes of the compiler API for the need of research: we need a new language object or IR node whenever we want to test out new primitives.
However, we don't want to keep changing our API. Besides that, we also want to
- be able to serialize any language object and IRs
- be able to explore, print, and manipulate the IR objects in front-end language to do quick prototyping.
We introduced a base class, called [Node](https://github.com/dmlc/HalideIR/blob/master/src/tvm/node.h#L52) to solve this problem.
-All the language object in compiler stack is a subclass of Node. Each node contains a string type_key that uniquely identifies
-the type of object. We choose string instead of int as type key so new Node class can be added in decentralized fashion without
+All the language objects in the compiler stack are subclasses of Node. Each node contains a string type_key that uniquely identifies
+the type of object. We choose string instead of int as the type key so new Node classes can be added in a decentralized fashion without
adding the code back to the central repo. To speed up dispatching, we allocate an integer type_index at runtime for each type_key.
-Since usually one Node object could be referenced in multiple places in the language. We use a shared_ptr to keep
-track of reference. We use NodeRef class to represents a reference to the Node.
+Since usually one Node object could be referenced in multiple places in the language, we use a shared_ptr to keep
+track of references. We use the NodeRef class to represent a reference to the Node.
We can roughly view the NodeRef class as a shared_ptr to the Node container.
We can also define a subclass of NodeRef to hold each subtype of Node. Each Node class needs to define the VisitAttrs function.
@@ -206,7 +206,7 @@ class TensorNode : public Node {
```
In the above examples, both ```Operation``` and ```Array``` are NodeRef.
The VisitAttrs gives us a reflection API to visit each member of the object.
-We can use this function to visit the node any serialize any language object recursively.
+We can use this function to visit the node and serialize any language object recursively.
It also allows us to get members of an object easily in the front-end language.
For example, in the following code, we access the op field of the TensorNode.
@@ -220,13 +220,13 @@ print(x.op.name)
New Nodes can be added in C++ without changing the front-end runtime, making it easy to extend the compiler stack.
Note that this is not the fastest way to expose members to the front-end language, but it might be one of the simplest
-approach possible. We also find it fits our purposes as we mainly use python for testing and prototyping and still use c++
+approaches possible. We also find that it fits our purposes as we mainly use python for testing and prototyping and still use c++
to do the heavy lifting.
## Implementation Details
Each argument in PackedFunc contains a union value [TVMValue](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L122)
-and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language
+and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and the statically typed language to
do runtime type checking during conversion.
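+
+The C sketch below illustrates this convention, packing each argument as a TVMValue plus a type code (`myadd` is a hypothetical registered function and error checking is omitted):
+
+```c
+#include <tvm/runtime/c_runtime_api.h>
+
+int call_myadd(double *result) {
+  TVMFunctionHandle f;
+  TVMFuncGetGlobal("myadd", &f);  // look up a registered PackedFunc
+
+  TVMValue args[2];
+  int type_codes[2];
+  args[0].v_float64 = 1.0; type_codes[0] = kDLFloat;  // value + type code
+  args[1].v_float64 = 2.0; type_codes[1] = kDLFloat;
+
+  TVMValue ret;
+  int ret_type;
+  int status = TVMFuncCall(f, args, type_codes, 2, &ret, &ret_type);
+  *result = ret.v_float64;
+  return status;
+}
+```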
The relevant files are
diff --git a/docs/faq.md b/docs/faq.md
index 92cb886f1ca7..54df0ced8fa8 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -4,7 +4,7 @@ This document contains frequently asked questions.
How to Install
--------------
-See [Installation](https://github.com/dmlc/tvm/blob/master/docs/how_to/install.md)
+See [Installation](http://tvm.ai/install/)
TVM's relation to Other IR/DSL Projects
---------------------------------------
diff --git a/docs/how_to/contribute.md b/docs/how_to/contribute.md
deleted file mode 100644
index a0ba99bdf718..000000000000
--- a/docs/how_to/contribute.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Contribute to TVM
-
-TVM has been developed by community members.
-Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
-
-- Please add your name to [CONTRIBUTORS.md](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md)
-- Please update [NEWS.md](https://github.com/dmlc/tvm/blob/master/NEWS.md) to add note on your changes to the API or added a new document.
-
-## Guidelines
-* [Submit Pull Request](#submit-pull-request)
-* [Git Workflow Howtos](#git-workflow-howtos)
- - [How to resolve conflict with master](#how-to-resolve-conflict-with-master)
- - [How to combine multiple commits into one](#how-to-combine-multiple-commits-into-one)
- - [What is the consequence of force push](#what-is-the-consequence-of-force-push)
-* [Document](#document)
-* [Testcases](#testcases)
-* [Examples](#examples)
-* [Core Library](#core-library)
-* [Python Package](#python-package)
-
-## Submit Pull Request
-* Before submit, please rebase your code on the most recent version of master, you can do it by
-```bash
-git remote add upstream [url to tvm repo]
-git fetch upstream
-git rebase upstream/master
-```
-* If you have multiple small commits,
- it might be good to merge them together(use git rebase then squash) into more meaningful groups.
-* Send the pull request!
- - Fix the problems reported by automatic checks
- - If you are contributing a new module or new function, add a test.
-
-## Git Workflow Howtos
-### How to resolve conflict with master
-- First rebase to most recent master
-```bash
-# The first two steps can be skipped after you do it once.
-git remote add upstream [url to tvm repo]
-git fetch upstream
-git rebase upstream/master
-```
-- The git may show some conflicts it cannot merge, say ```conflicted.py```.
- - Manually modify the file to resolve the conflict.
- - After you resolved the conflict, mark it as resolved by
-```bash
-git add conflicted.py
-```
-- Then you can continue rebase by
-```bash
-git rebase --continue
-```
-- Finally push to your fork, you may need to force push here.
-```bash
-git push --force
-```
-
-### How to combine multiple commits into one
-Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones,
-to create a PR with set of meaningful commits. You can do it by following steps.
-- Before doing so, configure the default editor of git if you haven't done so before.
-```bash
-git config core.editor the-editor-you-like
-```
-- Assume we want to merge last 3 commits, type the following commands
-```bash
-git rebase -i HEAD~3
-```
-- It will pop up an text editor. Set the first commit as ```pick```, and change later ones to ```squash```.
-- After you saved the file, it will pop up another text editor to ask you modify the combined commit message.
-- Push the changes to your fork, you need to force push.
-```bash
-git push --force
-```
-
-### Reset to the most recent master
-You can always use git reset to reset your version to the most recent master.
-Note that all your ***local changes will get lost***.
-So only do it when you do not have local changes or when your pull request just get merged.
-```bash
-git reset --hard [hash tag of master]
-git push --force
-```
-
-### What is the consequence of force push
-The previous two tips requires force push, this is because we altered the path of the commits.
-It is fine to force push to your own fork, as long as the commits changed are only yours.
-
-## Testcases
-- All the testcases are in tests
-
-## Core Library
-- Follow Google C style for C++.
-- We use doxygen to document all the interface code.
-- You can reproduce the linter checks by typing ```make lint```
-
-## Python Package
-- Always add docstring to the new functions in numpydoc format.
-- You can reproduce the linter checks by typing ```make lint```
diff --git a/docs/how_to/install.md b/docs/how_to/install.md
deleted file mode 100644
index 54db42281623..000000000000
--- a/docs/how_to/install.md
+++ /dev/null
@@ -1,92 +0,0 @@
-Installation Guide
-==================
-This page gives instructions on how to build and install the tvm package from
-scratch on various systems. It consists of two steps:
-
-1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows).
-2. Setup for the language packages (e.g. Python Package).
-
-To get started, clone tvm repo from github. It is important to clone the submodules along, with ```--recursive``` option.
-```bash
-git clone --recursive https://github.com/dmlc/tvm
-```
-For windows users who use github tools, you can open the git shell, and type the following command.
-```bash
-git submodule init
-git submodule update
-```
-
-## Contents
-- [Build the Shared Library](#build-the-shared-library)
-- [Python Package Installation](#python-package-installation)
-
-## Build the Shared Library
-
-Our goal is to build the shared library:
-- On Linux/OSX the target library is `libtvm.so`
-- On Windows the target library is `libtvm.dll`
-
-The minimal building requirement is
-- A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
-
-You can edit `make/config.mk` to change the compile options, and then build by
-`make`. If everything goes well, we can go to the specific language installation section.
-
-### Building on Windows
-
-TVM support build via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**. In order to generate the VS solution file using cmake,
-make sure you have a recent version of cmake added to your path and then from the tvm directory:
-
-```bash
-mkdir build
-cd build
-cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
-```
-This will generate the VS project using the MSVC 14 64 bit generator. Open the .sln file in the build directory and build with Visual Studio.
-
-### Customized Building
-
-Install prerequisites first:
-
-```bash
-sudo apt-get update
-sudo apt-get install -y python python-dev python-setuptools gcc libtinfo-dev zlib1g-dev
-```
-
-The configuration of tvm can be modified by ```config.mk```
-- First copy ```make/config.mk``` to the project root, on which
- any local modification will be ignored by git, then modify the according flags.
-- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.
- - LLVM 4.0 or higher is needed for build with LLVM. Note that verison of LLVM from default apt may lower than 4.0.
- - Since LLVM takes long time to build from source, you can download pre-built version of LLVM frorm
- [LLVM Download Page](http://releases.llvm.org/download.html).
- - Unzip to a certain location, modify ```config.mk``` to add ```LLVM_CONFIG=/path/to/your/llvm/bin/llvm-config```
- - You can also use [LLVM Nightly Ubuntu Build](https://apt.llvm.org/)
- - Note that apt-package append ```llvm-config``` with version number. For example, set ```LLVM_CONFIG=llvm-config-4.0``` if you installed 4.0 package
- - By default CUDA and OpenCL code generator do not require llvm.
-
-## Python Package Installation
-
-The python package is located at python
-There are several ways to install the package:
-
-1. Set the environment variable `PYTHONPATH` to tell python where to find
- the library. For example, assume we cloned `tvm` on the home directory
- `~`. then we can added the following line in `~/.bashrc`.
- It is ***recommended for developers*** who may change the codes.
- The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call ```setup``` again)
-
- ```bash
- export PYTHONPATH=/path/to/tvm/python:/path/to/tvm/topi/python:${PYTHONPATH}
- ```
-
-2. Install tvm python bindings by `setup.py`:
-
- ```bash
- # install tvm package for the current user
- # NOTE: if you installed python via homebrew, --user is not needed during installaiton
- # it will be automatically installed to your user directory.
- # providing --user flag may trigger error during installation in such case.
- cd python; python setup.py install --user; cd ..
- cd topi/python; python setup.py install --user; cd ../..
- ```
diff --git a/docs/index.rst b/docs/index.rst
index 9fa690e00fd9..20e64bfef641 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,22 +1,39 @@
TVM Documentation
=================
-Welcome to TVM documentation.
-
-
-Contents
---------
-
+Get Started
+-----------
.. toctree::
:maxdepth: 1
- self
- how_to/install
+ install/index
tutorials/index
+ vta/index
+ deploy/index
+ contribute/index
faq
- how_to/deploy
- how_to/contribute
+
+API Reference
+-------------
+.. toctree::
+ :maxdepth: 2
+
+ langref/index
api/python/index
- dev/index
api_links
+
+Developer Guide
+---------------
+.. toctree::
+ :maxdepth: 2
+
+ dev/index
+ nnvm_top
+
+
+Index
+-----
+.. toctree::
+ :maxdepth: 1
+
genindex
diff --git a/docs/install/docker.rst b/docs/install/docker.rst
new file mode 100644
index 000000000000..8d089522761d
--- /dev/null
+++ b/docs/install/docker.rst
@@ -0,0 +1,45 @@
+.. _docker-images:
+
+Docker Images
+=============
+We provide several prebuilt docker images to quickly try out tvm.
+These images are also helpful to run through TVM demos and tutorials.
+You can get the docker images via the following steps.
+We need `docker <https://docs.docker.com/engine/installation/>`_ and
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use cuda.
+
+First, clone tvm repo to get the auxiliary scripts
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+
+
+We can then use the following command to launch a `tvmai/demo-cpu` image.
+
+.. code:: bash
+
+ /path/to/tvm/docker/bash.sh tvmai/demo-cpu
+
+You can also change `demo-cpu` to `demo-gpu` to get a CUDA enabled image.
+You can find all the prebuilt images in `<https://hub.docker.com/r/tvmai/>`_
+
+
+This auxiliary script does the following things:
+
+- Mount current directory to /workspace
+- Switch user to be the same user that calls the bash.sh (so you can read/write host system)
+- Use the host-side network (so you can use jupyter notebook)
+
+
+Then you can start a jupyter notebook by typing
+
+.. code:: bash
+
+ jupyter notebook
+
+
+Docker Source
+-------------
+Check out `<https://github.com/dmlc/tvm/tree/master/docker>`_ if you are interested in
+building your own docker images.
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
new file mode 100644
index 000000000000..edeba1ccfadc
--- /dev/null
+++ b/docs/install/from_source.rst
@@ -0,0 +1,157 @@
+.. _install-from-source:
+
+Install from Source
+===================
+This page gives instructions on how to build and install the tvm package from
+scratch on various systems. It consists of two steps:
+
+1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows).
+2. Setup for the language packages (e.g. Python Package).
+
+To get started, clone tvm repo from github. It is important to clone the submodules along, with ``--recursive`` option.
+
+.. code:: bash
+
+ git clone --recursive https://github.com/dmlc/tvm
+
+For windows users who use github tools, you can open the git shell, and type the following command.
+
+.. code:: bash
+
+ git submodule init
+ git submodule update
+
+
+Build the Shared Library
+------------------------
+
+Our goal is to build the shared libraries:
+
+- On Linux the target libraries are `libtvm.so, libtvm_topi.so`
+- On OSX the target libraries are `libtvm.dylib, libtvm_topi.dylib`
+- On Windows the target libraries are `libtvm.dll, libtvm_topi.dll`
+
+On Ubuntu, you can install the minimal prerequisites with:
+
+.. code:: bash
+
+ sudo apt-get update
+ sudo apt-get install -y python python-dev python-setuptools gcc libtinfo-dev zlib1g-dev
+
+The minimal building requirements are
+
+- A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
+- CMake 3.5 or higher
+- We highly recommend building with LLVM to enable all the features.
+- It is possible to build without the llvm dependency if we only want to use CUDA/OpenCL.
+
+We use cmake to build the library.
+The configuration of tvm can be modified by `config.cmake`.
+
+
+- First, check the cmake in your system. If you do not have cmake,
+  you can obtain the latest version from the `official website <https://cmake.org/download/>`_
+- Then create a build directory and copy ``cmake/config.cmake`` into it.
+
+ .. code:: bash
+
+ mkdir build
+ cp cmake/config.cmake build
+
+- Edit ``build/config.cmake`` to customize the compilation options
+
+ - On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors.
+  - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable the CUDA backend. Do the same for other backends and libraries
+    (OpenCL, ROCM, METAL, VULKAN, ...) you want to enable.
+
+- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.
+
+  - LLVM 4.0 or higher is needed for builds with LLVM. Note that the version of LLVM from the default apt may be lower than 4.0.
+  - Since LLVM takes a long time to build from source, you can download a pre-built version from the
+    `LLVM Download Page <http://releases.llvm.org/download.html>`_.
+
+
+ - Unzip to a certain location, modify ``build/config.cmake`` to add ``set(USE_LLVM /path/to/your/llvm/bin/llvm-config)``
+ - You can also directly set ``set(USE_LLVM ON)`` and let cmake search for a usable version of LLVM.
+
+  - You can also use the `LLVM Nightly Ubuntu Build <https://apt.llvm.org/>`_
+
+    - Note that the apt package appends the version number to ``llvm-config``.
+      For example, set ``set(USE_LLVM llvm-config-4.0)`` if you installed the 4.0 package.
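+
+- As an illustration only (the llvm-config value below is a placeholder for whichever version you installed), a minimal ``build/config.cmake`` enabling CUDA and LLVM could contain:
+
+  .. code:: cmake
+
+    set(USE_CUDA ON)
+    set(USE_LLVM llvm-config-4.0)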
+
+- We can then build tvm and related libraries.
+
+ .. code:: bash
+
+ cd build
+ cmake ..
+ make -j4
+
+If everything goes well, we can go to :ref:`python-package-installation`.
+
+Building on Windows
+~~~~~~~~~~~~~~~~~~~
+
+TVM supports building via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**.
+In order to generate the VS solution file using cmake,
+make sure you have a recent version of cmake added to your path and then from the tvm directory:
+
+.. code:: bash
+
+ mkdir build
+ cd build
+ cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
+
+This will generate the VS project using the MSVC 14 64 bit generator.
+Open the .sln file in the build directory and build with Visual Studio.
+In order to build with LLVM in windows, you will need to build LLVM from source.
+You also need to build nnvm by running the same script under the nnvm folder.
+
+Building ROCm support
+~~~~~~~~~~~~~~~~~~~~~
+
+Currently, ROCm is supported only on linux, so all the instructions are written with linux in mind.
+
+- Set ``set(USE_ROCM ON)``, set ROCM_PATH to the correct path.
+- You need to first install the HIP runtime from ROCm. Make sure the installation system has ROCm installed on it.
+- Install the latest stable version of LLVM (v6.0.1) and LLD; make sure ``ld.lld`` is available on the command line.
+
+.. _python-package-installation:
+
+Python Package Installation
+---------------------------
+
+The python package is located in the `python` directory.
+There are several ways to install the package:
+
+1. Set the environment variable `PYTHONPATH` to tell python where to find
+   the library. For example, assume we cloned `tvm` in the home directory
+   `~`; then we can add the following line to `~/.bashrc`.
+   This is **recommended for developers** who may change the code.
+   The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
+
+ .. code:: bash
+
+ export PYTHONPATH=/path/to/tvm/python:/path/to/tvm/topi/python:/path/to/tvm/nnvm/python:${PYTHONPATH}
+
+
+2. Install tvm python bindings by `setup.py`:
+
+ .. code:: bash
+
+ # install tvm package for the current user
+    # NOTE: if you installed python via homebrew, --user is not needed during installation
+ # it will be automatically installed to your user directory.
+ # providing --user flag may trigger error during installation in such case.
+ export MACOSX_DEPLOYMENT_TARGET=10.9 # This is required for mac to avoid symbol conflicts with libstdc++
+ cd python; python setup.py install --user; cd ..
+ cd topi/python; python setup.py install --user; cd ../..
+ cd nnvm/python; python setup.py install --user; cd ../..
+
+
+Install Contrib Libraries
+-------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ nnpack
diff --git a/docs/install/index.rst b/docs/install/index.rst
new file mode 100644
index 000000000000..cc39f2433c7e
--- /dev/null
+++ b/docs/install/index.rst
@@ -0,0 +1,13 @@
+Installation
+============
+To install TVM, please read :ref:`install-from-source`.
+If you are interested in deploying to mobile/embedded devices,
+you do not need to install the entire tvm stack on your device.
+Instead, you only need the runtime; please read :ref:`deploy-and-integration`.
+If you would like to quickly try out TVM or run demos and tutorials, check out :ref:`docker-images`.
+
+.. toctree::
+ :maxdepth: 2
+
+ from_source
+ docker
diff --git a/docs/install/nnpack.md b/docs/install/nnpack.md
new file mode 100644
index 000000000000..d4e6e39e4023
--- /dev/null
+++ b/docs/install/nnpack.md
@@ -0,0 +1,81 @@
+# NNPACK Contrib Installation
+
+[NNPACK](https://github.com/Maratyszcza/NNPACK) is an acceleration package
+for neural network computations, which can run on x86-64, ARMv7, or ARM64 architecture CPUs.
+Using NNPACK, higher-level libraries like _MXNet_ can speed up
+the execution on multi-core CPU computers, including laptops and mobile devices.
+
+***Note***: As TVM already has natively tuned schedules, NNPACK is here mainly for reference and comparison purposes.
+For regular use, prefer the natively tuned TVM implementation.
+
+_TVM_ supports NNPACK for forward propagation (inference only) in convolution, max-pooling, and fully-connected layers.
+In this document, we give a high level overview of how to use NNPACK with _TVM_.
+
+## Conditions
+The underlying implementation of NNPACK utilizes several acceleration methods,
+including [fft](https://arxiv.org/abs/1312.5851) and [winograd](https://arxiv.org/abs/1509.09308).
+These algorithms work better on some special `batch size`, `kernel size`, and `stride` settings than on others,
+so depending on the context, not all convolution, max-pooling, or fully-connected layers can be powered by NNPACK.
+When favorable conditions for running NNPACK are not met, the computation falls back to the default implementation.
+
+NNPACK only supports Linux and OS X systems. Windows is not supported at present.
+
+## Build/Install NNPACK
+
+If the trained model meets certain conditions for using NNPACK,
+you can build TVM with NNPACK support.
+Follow these simple steps:
+* Build NNPACK shared library with the following commands. _TVM_ will link NNPACK dynamically.
+
+Note: The following NNPACK installation instructions have been tested on Ubuntu 16.04.
+
+### Build [Ninja](https://ninja-build.org/)
+
+NNPACK needs a recent version of Ninja, so we need to install ninja from source.
+```bash
+git clone git://github.com/ninja-build/ninja.git
+cd ninja
+./configure.py --bootstrap
+```
+
+Set the environment variable PATH to tell bash where to find the ninja executable. For example, assume we cloned ninja in the home directory `~`; then we can add the following line to `~/.bashrc`.
+```bash
+export PATH="${PATH}:~/ninja"
+```
+
+### Build [NNPACK](https://github.com/Maratyszcza/NNPACK)
+
+The new CMake version of NNPACK downloads [PeachPy](https://github.com/Maratyszcza/PeachPy) and other dependencies on its own.
+
+```bash
+git clone --recursive https://github.com/Maratyszcza/NNPACK.git
+cd NNPACK
+# Add PIC option in CFLAG and CXXFLAG to build NNPACK shared library
+sed -i "s|gnu99|gnu99 -fPIC|g" CMakeLists.txt
+sed -i "s|gnu++11|gnu++11 -fPIC|g" CMakeLists.txt
+mkdir build
+cd build
+# Generate ninja build rule and add shared library in configuration
+cmake -G Ninja -D BUILD_SHARED_LIBS=ON ..
+ninja
+sudo ninja install
+
+# Add NNPACK lib folder in your ldconfig
+echo "/usr/local/lib" > /etc/ld.so.conf.d/nnpack.conf
+sudo ldconfig
+```
+
+## Build TVM with NNPACK support
+
+```bash
+git clone --recursive https://github.com/dmlc/tvm
+```
+
+* Set `set(USE_NNPACK ON)` in config.cmake.
+* Set `NNPACK_PATH` to $(YOUR_NNPACK_INSTALL_PATH).
+
+After configuration, use `make` to build TVM:
+
+```bash
+make
+```
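+
+As a quick sanity check, the NNPACK contrib API can then be exercised from python. This is a hedged sketch: it assumes TVM was built with `set(USE_NNPACK ON)` and that the shapes meet NNPACK's conditions.
+
+```python
+import tvm
+from tvm.contrib import nnpack
+
+n, m = 64, 128
+data = tvm.placeholder((n,), name='data')
+weight = tvm.placeholder((m, n), name='weight')
+# Offload a fully connected (inference) layer to NNPACK.
+out = nnpack.fully_connected_inference(data, weight)
+```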
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
new file mode 100644
index 000000000000..fdaed2b5be40
--- /dev/null
+++ b/docs/langref/hybrid_script.rst
@@ -0,0 +1,172 @@
+.. _hybrid-langref-label:
+
+Hybrid Frontend Language Reference
+==================================
+
+Overview
+--------
+
+This hybrid frontend allows users to write preliminary versions of some idioms that have not yet
+been supported by TVM officially.
+
+Features
+--------
+
+Software Emulation
+~~~~~~~~~~~~~~~~~~
+
+Both software emulation and compilation are supported. To define a function,
+you need to use the ``tvm.hybrid.script`` decorator to indicate that this is a hybrid function:
+
+.. code-block:: python
+
+ import numpy
+ import tvm
+
+ @tvm.hybrid.script
+ def outer_product(a, b, c):
+     for i in range(a.shape[0]):
+         for j in range(b.shape[0]):
+             c[i, j] = a[i] * b[j]
+
+ a = numpy.random.rand(100)
+ b = numpy.random.rand(99)
+ c = numpy.zeros((100, 99))
+ outer_product(a, b, c)
+
+This decorator will automatically import the `Keywords`_ required for software emulation.
+After software emulation is done, the imported keywords will be cleaned up. Users do not need to
+worry about keyword conflicts and pollution.
+
+Every element passed for software emulation in the argument list is either a python variable
+or a ``numpy`` numeric type.
+
+Backend Compilation
+~~~~~~~~~~~~~~~~~~~
+
+The current parse interface looks like:
+
+.. code-block:: python
+
+ a = tvm.placeholder((100, ), name='a')
+ b = tvm.placeholder((99, ), name='b')
+ c = tvm.placeholder((100, 99), name='c')
+ tvm.hybrid.parse(outer_product, [a, b, c]) # return an ir root of this function
+
+If we pass these tvm tensors to this function, it returns an op node:
+
+**Under construction, we are still deciding what kind of node should be returned.**
+
+.. code-block:: python
+
+ a = tvm.placeholder((100, ), name='a')
+ b = tvm.placeholder((99, ), name='b')
+ c = tvm.placeholder((100, 99), name='c')
+ op = outer_product(a, b, c) # return the corresponding op node
+
+Tuning
+~~~~~~
+
+**Under construction, not truly supported yet.**
+
+Following the example above, you can use some tvm-like interfaces to tune the code:
+
+.. code-block:: python
+
+ sch = tvm.create_schedule(op)
+ jo, ji = sch.split(j, 4)
+ sch.vectorize(ji)
+
+``split``, ``reorder``, and loop_annotation will be supported!
+
+Loops
+~~~~~
+
+In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``.
+
+Here we use ``range`` (aka ``serial``), ``unroll``, ``parallel``, and ``vectorize``;
+these **4** keywords annotate the corresponding types of for loops.
+The usage is roughly the same as the Python standard ``range``; see the sketch below.
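+
+For example, a small illustrative sketch (not an official sample):
+
+.. code-block:: python
+
+ @tvm.hybrid.script
+ def loop_demo(a, b):
+     for i in parallel(8):      # iterations of i may run in parallel
+         for j in unroll(4):    # j is fully unrolled in the lowered IR
+             b[i, j] = a[i, j] + 1.0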
+
+Variables
+~~~~~~~~~
+
+All the mutable variables will be lowered to an array with size 1.
+It regards the first store of a variable as its declaration.
+
+.. note::
+
+ Unlike conventional Python, in hybrid script, the declared variable
+ can only be used in the scope level it is declared.
+
+
+.. note::
+
+ Currently, you can ONLY use basic-typed variables, i.e. the type of the
+ variable should be either ``float32``, or ``int32``.
+
+.. code-block:: python
+
+ for i in range(5):
+     s = 0 # declaration, this s will be a 1-array in lowered IR
+     for j in range(5):
+         s += a[i, j] # do something with s
+     b[i] = s # you can still use s at this level
+ a[0] = s # you CANNOT use s here, even though it is allowed in conventional Python
+ b = (1, 2) # this has NOT been supported yet!
+
+
+Attributes
+~~~~~~~~~~
+
+So far, ONLY tensors' ``shape`` attribute is supported! The ``shape`` attribute is essentially a
+tuple, so you MUST access it as an array. Also, currently, only constant-indexed access is supported.
+
+.. code-block:: python
+
+ x = a.shape[2] # OK!
+ for i in range(3):
+     for j in a.shape[i]: # BAD! i is not a constant!
+         # do something
+
+
+Conditional Statement and Expression
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ if condition:
+     # do something
+
+ a = b if condition else c
+
+However, NO ``True`` or ``False`` keyword is supported yet.
+
+
+Math Intrinsics
+~~~~~~~~~~~~~~~
+
+So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``,
+``tanh``, ``power``, and ``popcount``, are supported.
+No import is required; just as mentioned in `Software Emulation`_, just use them!
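+
+For instance, an illustrative one-liner inside a hybrid function body:
+
+.. code-block:: python
+
+ for i in range(a.shape[0]):
+     b[i] = sigmoid(a[i])   # no import needed for the intrinsic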
+
+Array Allocation
+~~~~~~~~~~~~~~~~
+
+**Under construction, this function will be supported later!**
+
+Use a function call ``allocation(shape, type, share/local)`` to declare an array buffer.
+The basic usage is roughly the same as a normal array.
+
+
+Thread Bind
+~~~~~~~~~~~
+
+
+You can also do loop-thread binding by writing code like this:
+
+.. code-block:: python
+
+ for tx in bind("threadIdx.x", 100):
+     a[tx] = b[tx]
+
+
+Keywords
+~~~~~~~~
+- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``
+- Math keywords: ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, ``popcount``
diff --git a/docs/langref/index.rst b/docs/langref/index.rst
new file mode 100644
index 000000000000..65f78d1d278b
--- /dev/null
+++ b/docs/langref/index.rst
@@ -0,0 +1,9 @@
+Language Reference
+==================
+This document provides references to the
+embedded languages in the TVM stack.
+
+.. toctree::
+ :maxdepth: 2
+
+ hybrid_script
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
new file mode 100644
index 000000000000..96a37b779e1e
--- /dev/null
+++ b/docs/nnvm_top.rst
@@ -0,0 +1,193 @@
+NNVM Core Tensor Operators
+==========================
+
+This page contains the list of core tensor operator primitives pre-defined in NNVM.
+The core tensor operator primitives (``nnvm.top``) cover typical workloads in deep learning.
+They can represent workloads in front-end frameworks, and provide basic building blocks for optimization.
+Since deep learning is a fast-evolving field, it is possible to encounter operators that are not listed here.
+NNVM is designed for this problem and can easily register new operators without changing the core library.
+
+.. note::
+
+ Each operator node in the graph IR contains the following two kinds of parameters.
+
+ - inputs: positional list of input tensors
+ - attrs: attributes about the operator (e.g. kernel_size in conv2d)
+
+ This document lists both inputs and attributes in the parameter field. You can distinguish them by the marked type. The inputs are of type Tensor, while the remaining parameters are attributes.
+ To construct the graph with NNVM python API, a user can pass in the input Tensors as positional arguments, and attributes as keyword arguments.
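+
+ For example, a small illustrative sketch of building a graph with the python API (attribute values are arbitrary):
+
+ .. code-block:: python
+
+  import nnvm.symbol as sym
+
+  x = sym.Variable("data")
+  w = sym.Variable("weight")
+  # inputs (data, weight) are Tensors passed as arguments;
+  # channels and kernel_size are attributes passed as keywords
+  y = sym.conv2d(data=x, weight=w, channels=32, kernel_size=(3, 3))
+  z = sym.relu(y)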
+
+
+Overview of Operators
+---------------------
+**Level 1: Basic Operators**
+
+This level enables fully connected multi-layer perceptron.
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.dense
+ nnvm.symbol.relu
+ nnvm.symbol.tanh
+ nnvm.symbol.sigmoid
+ nnvm.symbol.exp
+ nnvm.symbol.log
+ nnvm.symbol.sqrt
+ nnvm.symbol.elemwise_add
+ nnvm.symbol.elemwise_sub
+ nnvm.symbol.elemwise_mul
+ nnvm.symbol.elemwise_div
+ nnvm.symbol.elemwise_sum
+ nnvm.symbol.flatten
+ nnvm.symbol.concatenate
+ nnvm.symbol.expand_dims
+ nnvm.symbol.squeeze
+ nnvm.symbol.split
+ nnvm.symbol.dropout
+ nnvm.symbol.batch_norm
+ nnvm.symbol.softmax
+ nnvm.symbol.log_softmax
+ nnvm.symbol.pad
+ nnvm.symbol.block_grad
+
+
+**Level 2: Convolutions**
+
+This level enables typical convnet models.
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.conv2d
+ nnvm.symbol.conv2d_transpose
+ nnvm.symbol.max_pool2d
+ nnvm.symbol.avg_pool2d
+ nnvm.symbol.global_max_pool2d
+ nnvm.symbol.global_avg_pool2d
+
+
+**Level 3: Additional Tensor Ops**
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.reshape
+ nnvm.symbol.copy
+ nnvm.symbol.negative
+ nnvm.symbol.floor
+ nnvm.symbol.ceil
+ nnvm.symbol.round
+ nnvm.symbol.trunc
+ nnvm.symbol.abs
+ nnvm.symbol.leaky_relu
+ nnvm.symbol.__add_scalar__
+ nnvm.symbol.__sub_scalar__
+ nnvm.symbol.__rsub_scalar__
+ nnvm.symbol.__mul_scalar__
+ nnvm.symbol.__div_scalar__
+ nnvm.symbol.__rdiv_scalar__
+ nnvm.symbol.__pow_scalar__
+ nnvm.symbol.__rpow_scalar__
+ nnvm.symbol.__lshift_scalar__
+ nnvm.symbol.__rshift_scalar__
+
+**Level 4: Broadcast and Reductions**
+
+.. autosummary::
+ :nosignatures:
+
+ nnvm.symbol.transpose
+ nnvm.symbol.broadcast_to
+ nnvm.symbol.sum
+ nnvm.symbol.min
+ nnvm.symbol.max
+ nnvm.symbol.broadcast_add
+ nnvm.symbol.broadcast_sub
+ nnvm.symbol.broadcast_mul
+ nnvm.symbol.broadcast_div
+ nnvm.symbol.clip
+ nnvm.symbol.greater
+ nnvm.symbol.less
+ nnvm.symbol.expand_like
+ nnvm.symbol.reshape_like
+ nnvm.symbol.full
+ nnvm.symbol.full_like
+ nnvm.symbol.ones
+ nnvm.symbol.ones_like
+ nnvm.symbol.zeros
+ nnvm.symbol.zeros_like
+
+Detailed Definitions
+--------------------
+.. autofunction:: nnvm.symbol.dense
+.. autofunction:: nnvm.symbol.relu
+.. autofunction:: nnvm.symbol.tanh
+.. autofunction:: nnvm.symbol.sigmoid
+.. autofunction:: nnvm.symbol.exp
+.. autofunction:: nnvm.symbol.log
+.. autofunction:: nnvm.symbol.sqrt
+.. autofunction:: nnvm.symbol.elemwise_add
+.. autofunction:: nnvm.symbol.elemwise_sub
+.. autofunction:: nnvm.symbol.elemwise_mul
+.. autofunction:: nnvm.symbol.elemwise_div
+.. autofunction:: nnvm.symbol.elemwise_sum
+.. autofunction:: nnvm.symbol.flatten
+.. autofunction:: nnvm.symbol.concatenate
+.. autofunction:: nnvm.symbol.expand_dims
+.. autofunction:: nnvm.symbol.squeeze
+.. autofunction:: nnvm.symbol.split
+.. autofunction:: nnvm.symbol.dropout
+.. autofunction:: nnvm.symbol.batch_norm
+.. autofunction:: nnvm.symbol.softmax
+.. autofunction:: nnvm.symbol.log_softmax
+.. autofunction:: nnvm.symbol.pad
+.. autofunction:: nnvm.symbol.block_grad
+
+.. autofunction:: nnvm.symbol.conv2d
+.. autofunction:: nnvm.symbol.conv2d_transpose
+.. autofunction:: nnvm.symbol.max_pool2d
+.. autofunction:: nnvm.symbol.avg_pool2d
+.. autofunction:: nnvm.symbol.global_max_pool2d
+.. autofunction:: nnvm.symbol.global_avg_pool2d
+
+.. autofunction:: nnvm.symbol.reshape
+.. autofunction:: nnvm.symbol.copy
+.. autofunction:: nnvm.symbol.negative
+.. autofunction:: nnvm.symbol.floor
+.. autofunction:: nnvm.symbol.ceil
+.. autofunction:: nnvm.symbol.round
+.. autofunction:: nnvm.symbol.trunc
+.. autofunction:: nnvm.symbol.abs
+.. autofunction:: nnvm.symbol.leaky_relu
+.. autofunction:: nnvm.symbol.__add_scalar__
+.. autofunction:: nnvm.symbol.__sub_scalar__
+.. autofunction:: nnvm.symbol.__rsub_scalar__
+.. autofunction:: nnvm.symbol.__mul_scalar__
+.. autofunction:: nnvm.symbol.__div_scalar__
+.. autofunction:: nnvm.symbol.__rdiv_scalar__
+.. autofunction:: nnvm.symbol.__pow_scalar__
+.. autofunction:: nnvm.symbol.__rpow_scalar__
+.. autofunction:: nnvm.symbol.__lshift_scalar__
+.. autofunction:: nnvm.symbol.__rshift_scalar__
+
+.. autofunction:: nnvm.symbol.transpose
+.. autofunction:: nnvm.symbol.broadcast_to
+.. autofunction:: nnvm.symbol.sum
+.. autofunction:: nnvm.symbol.min
+.. autofunction:: nnvm.symbol.max
+.. autofunction:: nnvm.symbol.broadcast_add
+.. autofunction:: nnvm.symbol.broadcast_sub
+.. autofunction:: nnvm.symbol.broadcast_mul
+.. autofunction:: nnvm.symbol.broadcast_div
+.. autofunction:: nnvm.symbol.clip
+.. autofunction:: nnvm.symbol.greater
+.. autofunction:: nnvm.symbol.less
+.. autofunction:: nnvm.symbol.expand_like
+.. autofunction:: nnvm.symbol.reshape_like
+.. autofunction:: nnvm.symbol.full
+.. autofunction:: nnvm.symbol.full_like
+.. autofunction:: nnvm.symbol.ones
+.. autofunction:: nnvm.symbol.ones_like
+.. autofunction:: nnvm.symbol.zeros
+.. autofunction:: nnvm.symbol.zeros_like
diff --git a/docs/vta/.gitignore b/docs/vta/.gitignore
new file mode 100644
index 000000000000..a07068979a60
--- /dev/null
+++ b/docs/vta/.gitignore
@@ -0,0 +1 @@
+tutorials
\ No newline at end of file
diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst
new file mode 100644
index 000000000000..5b0ee966872b
--- /dev/null
+++ b/docs/vta/dev/config.rst
@@ -0,0 +1,70 @@
+VTA Configuration
+=================
+
+The VTA stack incorporates both a hardware accelerator stack and
+a TVM based software stack.
+VTA incorporates flexibility out of the box: by modifying the
+``vta/config/vta_config.json`` high-level configuration file,
+the user can change the shape of the tensor intrinsic,
+clock frequency, pipelining, data type width, and on-chip buffer sizes.
+
+Parameters Overview
+-------------------
+
+We explain the parameters listed in the ``vta_config.json`` file in the table
+below.
+
++-----------------------+------------+--------------------------------------------------------+
+| Attribute | Format | Description |
++=======================+============+========================================================+
+| ``TARGET`` | String | The TVM device target. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_TARGET`` | Int | FPGA frequency in MHz. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool. |
++-----------------------+------------+--------------------------------------------------------+
+| ``HW_VER`` | String | VTA hardware version number. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_WGT_WIDTH`` | Int (log2) | Weight data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_INP_BUFF_SIZE`` | Int (log2) | Input on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_WGT_BUFF_SIZE`` | Int (log2) | Weight on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+| ``LOG_ACC_BUFF_SIZE`` | Int (log2) | Accumulator on-chip buffer in Bytes. |
++-----------------------+------------+--------------------------------------------------------+
+
+
+ .. note::
+
+ When a parameter name is preceded with ``LOG``, it means that it describes a value that can only be expressed as a power of two.
+ For that reason we describe these parameters by their log2 value.
+ For instance, to describe an integer width of 8-bits for the input data types, we set the ``LOG_INP_WIDTH`` to be 3, which is the log2 of 8.
+ Similarly, to describe a 64kB micro-op buffer, we would set ``LOG_UOP_BUFF_SIZE`` to be 16.
+
+We provide additional detail below regarding each parameter:
+
+ - ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``.
+ - ``HW_TARGET``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz.
+ - ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance).
+ - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identify hardware bitstreams.
+ - ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``.
+ - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
+ - ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension.
+ - ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension.
+
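+To make this concrete, the sketch below shows what a ``vta_config.json`` could look like (illustrative values only, not recommended defaults):
+
+.. code-block:: json
+
+ {
+   "TARGET" : "pynq",
+   "HW_TARGET" : 100,
+   "HW_CLK_TARGET" : 8,
+   "HW_VER" : "0.0.0",
+   "LOG_INP_WIDTH" : 3,
+   "LOG_WGT_WIDTH" : 3,
+   "LOG_ACC_WIDTH" : 5,
+   "LOG_OUT_WIDTH" : 3,
+   "LOG_BATCH" : 0,
+   "LOG_BLOCK_IN" : 4,
+   "LOG_BLOCK_OUT" : 4,
+   "LOG_UOP_BUFF_SIZE" : 15,
+   "LOG_INP_BUFF_SIZE" : 15,
+   "LOG_WGT_BUFF_SIZE" : 18,
+   "LOG_ACC_BUFF_SIZE" : 17
+ }
+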
diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst
new file mode 100644
index 000000000000..fd19f969687c
--- /dev/null
+++ b/docs/vta/dev/hardware.rst
@@ -0,0 +1,282 @@
+VTA Hardware Guide
+==================
+
+We present a top-down overview of the VTA hardware design.
+This hardware design guide covers VTA hardware at two levels:
+
+ - An architectural overview of the VTA design and its ISA hardware-software
+ interface.
+ - A micro-architectural overview of the VTA hardware modules, and the
+ micro-code specification for the compute core.
+
+VTA Overview
+------------
+
+VTA is a generic deep learning accelerator built for fast and efficient dense linear algebra.
+VTA incorporates a simple RISC-like processor that can perform dense linear algebra operations on rank 1 or 2 tensor registers.
+In addition the design adopts decoupled access-execute to hide memory access latency.
+
+
+To a broader extent, VTA can serve as a template deep learning accelerator design for full stack optimization, exposing a generic tensor computation interface to the compiler stack.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png
+ :align: center
+ :width: 80%
+
+The figure above gives a high-level overview of the VTA hardware organization.
+VTA is composed of four modules that communicate among each other via FIFO queues and local memory blocks (SRAM), to enable task-level pipeline parallelism:
+
+- The fetch module takes care of loading an instruction stream from DRAM. It also decodes those instructions to route them into one of three command queues.
+- The load module takes care of loading input and weight tensors from DRAM into data-specialized on-chip memories.
+- The compute module performs both dense linear algebra computation with its GEMM core, and general computation with its tensor ALU. It also takes care of loading data from DRAM into the register file, and loading micro-op kernels into the micro-op cache.
+- The store module stores results produced by the compute core back to DRAM.
+
+HLS Hardware Source Organization
+--------------------------------
+
+The VTA design is currently specified in Vivado HLS C++, which is only supported
+by Xilinx toolchains.
+The VTA hardware sources are contained under ``vta/hardware/xilinx/sources``:
+
+ - ``vta.cc`` contains the definitions for each VTA module, as well as a top
+ level behavioral model for the top-level VTA design.
+ - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and
+ function prototypes declarations.
+
+In addition preprocessor macros are defined under ``vta/include/vta/hw_spec.h``.
+Many of these macro definitions are derived from the parameters listed in the
+``vta/config/vta_config.json`` file.
+The json file is processed by ``vta/config/vta_config.py`` to produce a string of
+compile flags that define the preprocessor macros.
+That string is used by the makefile in order to set those high-level
+parameters in both the HLS hardware synthesis compiler, and the C++
+compiler that builds the VTA runtime.
+
+HLS Module Example
+~~~~~~~~~~~~~~~~~~
+
+We show a definition of one of the VTA modules defined in C++:
+
+.. code-block:: c
+
+ void fetch(
+   uint32_t insn_count,
+   volatile insn_T *insns,
+   hls::stream<insn_T> &load_queue,
+   hls::stream<insn_T> &gemm_queue,
+   hls::stream<insn_T> &store_queue) {
+ #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+ #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
+ #pragma HLS INTERFACE axis port = load_queue
+ #pragma HLS INTERFACE axis port = gemm_queue
+ #pragma HLS INTERFACE axis port = store_queue
+ #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
+
+   INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
+ #pragma HLS PIPELINE II = 1
+     // Read instruction fields
+     insn_T insn = insns[pc];
+     // Do some partial decoding
+     opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
+     memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+     // Push to appropriate instruction queue
+     if (opcode == VTA_OPCODE_STORE) {
+       store_queue.write(insn);
+     } else if (opcode == VTA_OPCODE_LOAD &&
+                (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
+       load_queue.write(insn);
+     } else {
+       gemm_queue.write(insn);
+     }
+   }
+ }
+
+A few observations on HLS coding:
+ - *Parameters:* The parameter list of each function, combined with the
+ interface pragmas define the hardware interface exposed by the
+ generated hardware module.
+
+ - Parameters passed by value indicate a read-only hardware memory-mapped
+ register that the host can write to.
+ This fetch function for instance has an ``insn_count`` parameter
+ which will be synthesized as a memory mapped register for the host
+ to write to, in order to set the length of a given VTA instruction
+ sequence.
+ - Pointer parameters can mean one of two things depending on the interface
+ pragma being used.
+
+ - When used with a ``m_axi`` interface pragma, an AXI master interface
+ gets generated to provide DMA access to DRAM.
+ - When used with a ``bram`` interface pragma, a BRAM interface gets
+ generated to expose read and/or write ports to an FPGA block-RAM.
+ - HLS streams being passed by reference combined with the ``axis`` interface
+ pragma produce FIFO interfaces to the module. Hardware FIFOs provide a
+ useful synchronization mechanism between modules.
+ - *Pragmas*: Compiler pragmas are essential to define hardware implementation
+ of each module. We list several pragmas used in the VTA design to communicate
+ implementation requirements to the compiler.
+
+ - ``HLS INTERFACE``: specifies the interface of the synthesized
+ hardware module.
+ - ``HLS PIPELINE``: defines hardware pipeline performance target by setting
+ an initiation interval goal. When the ``II == 1`` target is set, it tells
+ the compiler that the synthesized hardware pipeline should be able to
+ execute one loop iteration per cycle.
+ - ``HLS DEPENDENCE``: instructs the compiler to ignore certain types
+ of dependence checks in a given loop. Consider a loop body that writes
+ and reads to the same BRAM structure, and needs to achieve an II of 1.
+ The HLS compiler has to assume worst-case scenario, whereby a read is
+ issued to an address that a past write updated the cycle prior: this
+ cannot be achieved given BRAM timing characteristics (it takes at least
+ 2 cycles to see the updated value). Therefore in order to achieve an II of 1,
+ the dependence checks have to be relaxed.
+ Note that when turning this optimization on, it falls onto
+ the software stack to prevent writes followed by reads to the same address.
+
+ .. note::
+ This `reference guide <https://www.xilinx.com/support/documentation/sw_manuals/xilinx2018_2/ug902-vivado-high-level-synthesis.pdf>`_
+ provides a much more in-depth, and complete specification of HLS for the Xilinx 2018.2 toolchains.
+
+Architectural Overview
+----------------------
+
+Instruction Set Architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+VTA's instruction set architecture (ISA) is composed of 4 CISC instructions that have a variable execution latency, two of which execute a micro-coded instruction sequence to perform computation.
+
+The VTA instructions are listed below:
+
+- ``LOAD`` instruction: loads a 2D tensor from DRAM into the input buffer, weight buffer, or register file. It can also load a micro-kernel into the micro-op cache. Supports dynamic padding when loading input and weight tiles.
+- ``GEMM`` instruction: performs a micro-op sequence of matrix-matrix multiplications over an input tensor and a weight tensor, and adds the result to a register-file tensor.
+- ``ALU`` instruction: performs a micro-op sequence of matrix-matrix ALU operations over register-file tensor data.
+- ``STORE`` instruction: stores a 2D tensor from the output buffer to DRAM.
+
+The ``LOAD`` instructions are executed by the load and compute modules, depending on the target memory buffer location.
+The ``GEMM`` and ``ALU`` instructions are executed by the compute module's GEMM core and tensor ALU.
+Finally, the ``STORE`` instructions are executed by the store module exclusively.
+The fields of each instruction are described in the figure below.
+The meaning of each field will be further explained in the :ref:`vta-uarch` section.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/vta_instructions.png
+ :align: center
+ :width: 100%
+
+.. note::
+ Note that the VTA ISA changes as VTA's architectural parameters are modified (i.e. GEMM core shape, data type, memory size etc.), and as a result the ISA does not guarantee compatibility across all variants of VTA.
+ This is acceptable however, since the VTA runtime adapts to parameter changes, and produces binary code tailored for the version of the accelerator that gets generated.
+ This exemplifies the co-design philosophy adopted by the VTA stack which embraces fluidity of the hardware-software interface.
+
+Dataflow Execution
+~~~~~~~~~~~~~~~~~~
+
+VTA relies on dependence FIFO queues between hardware modules to synchronize the execution of concurrent tasks.
+The figure below shows how a given hardware module can execute concurrently from its producer and consumer modules in a dataflow fashion through the use of dependence FIFO queues, and single-reader/single-writer SRAM buffers.
+Each module is connected to its consumer and producer via read-after-write (RAW) and write-after-read (WAR) dependence queues.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/dataflow.png
+ :align: center
+ :width: 100%
+
+The pseudo-code above describes how a module executes a given instruction predicated on dependences with other instructions.
+First, the dependence flags within each instruction are decoded in hardware.
+If the instruction has an incoming RAW dependence, execution is predicated upon receiving a RAW dependence token from the producer module.
+Similarly, if the task has an incoming WAR dependence, execution is predicated upon receiving a WAR dependence token from the consumer module.
+Finally when the task is done, we check for outgoing RAW and WAR dependences, and notify the consumer and producer modules respectively.
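+
+A hedged pseudo-code sketch of this predication logic is given below (field and queue names are illustrative, not the actual hardware source):
+
+.. code-block:: python
+
+ def run_module(insn_queue, raw_in, war_in, raw_out, war_out):
+     while True:
+         insn = insn_queue.read()
+         if insn.pop_prev_dep:   # incoming RAW dependence
+             raw_in.read()       # block until the producer sends a token
+         if insn.pop_next_dep:   # incoming WAR dependence
+             war_in.read()       # block until the consumer sends a token
+         execute(insn)           # perform the load/compute/store work
+         if insn.push_prev_dep:  # outgoing WAR dependence
+             war_out.write(1)    # notify the producer
+         if insn.push_next_dep:  # outgoing RAW dependence
+             raw_out.write(1)    # notify the consumer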
+
+.. note::
+ Note that the dependence tokens in this scenario are information-less.
+ This is because the instructions executed by each module cannot be reordered by design, as they arrive in FIFO order.
+
+Pipeline Expandability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The default VTA design is composed of four modules that describe a 3-stage ``load-compute-store`` task pipeline.
+Following the dataflow hardware organization principle, we can extend the VTA pipeline to include more stages.
+For example, we can envision separating the tensor ALU from the GEMM core in order to maximize the utilization of the GEMM core.
+This would result in a ``load-gemm-activate-store`` task pipeline which closely reflects the TPU design.
+Adding more stages has a cost however: it can add storage and extra logic overhead, which is why we opted for a default 3-stage pipeline.
+
+.. _vta-uarch:
+
+Microarchitectural Overview
+---------------------------
+
+We describe the modules that compose the VTA design.
+The module definitions are contained in ``vta/hardware/xilinx/sources/vta.cc``.
+
+Fetch Module
+~~~~~~~~~~~~
+
+VTA is programmed by a linear instruction stream.
+The fetch module is the entry point of VTA to the CPU and is programmed via three memory mapped registers:
+
+- The read-write ``control`` register starts the fetch module, and is read to check for its completion.
+- The write-only ``insn_count`` register sets the number of instructions to execute.
+- The write-only ``insns`` register sets the start address of the instruction stream in DRAM.
+
+The CPU prepares the instruction stream in a physically-contiguous DRAM buffer managed by the VTA runtime.
+When the instruction stream is ready, the CPU writes the start physical address into the ``insns`` register, the length of the instruction stream into the ``insn_count`` register, and asserts the start signal in the ``control`` register.
+This procedure starts VTA, which reads in the instruction stream from DRAM via DMA.
+
+Upon accessing the instruction stream, the fetch module partially decodes instructions, and pushes those instructions into command queues that feed into the load, compute, and store modules:
+
+- ``STORE`` instructions are pushed to the store command queue to be processed by the store module.
+- ``GEMM`` and ``ALU`` instructions are pushed to the compute command queue to be processed by the compute module.
+- ``LOAD`` instructions that describe a load operation of micro-op kernels or register file data are pushed to the compute command queue to be processed by the compute module.
+- ``LOAD`` instructions that describe a load operation of input or weight data are pushed to the load command queue to be processed by the load module.
+
+When one of the command queues becomes full, the fetch module stalls until the queue is not full.
+Consequently, the command queues are sized to be deep enough to allow for a wide execution window, and allow multiple tasks to be in flight concurrently across the ``load-compute-store`` pipeline.
+
+
+Compute Module
+~~~~~~~~~~~~~~
+
+VTA's compute module acts as a RISC processor that performs computation on tensor registers rather than scalar registers.
+Two functional units mutate the register file: the tensor ALU, and the GEMM core.
+
+The compute module executes RISC micro-ops from the micro-op cache.
+There are two types of compute micro-ops: ALU and GEMM operations.
+To minimize the footprint of micro-op kernels, while avoiding the need for control-flow instructions such as conditional jumps, the compute module executes micro-op sequences inside a two-level nested loop that computes the location of each tensor register via an affine function.
+This compression approach helps reduce the micro-kernel instruction footprint, and applies to both matrix multiplication and 2D convolution, commonly found in neural network operators.
+
+.. image:: http://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/gemm_core.png
+ :align: center
+ :width: 100%
+
+The **GEMM core** evaluates GEMM instructions, by executing a micro-code sequence in a 2-level nested loop described in the Figure above.
+The GEMM core can perform one input-weight matrix multiplication per cycle.
+The dimensions of the single-cycle matrix multiplication define a hardware *tensorization intrinsic* which the TVM compiler has to lower a computation schedule onto.
+This tensorization intrinsic is defined by the dimensions of the input, weight and accumulator tensors.
+Each data type can have a different integer precision: typically both weight and input types are low-precision (8-bits or less), while the accumulator tensor has a wider type to prevent overflows (32-bits).
+In order to keep the GEMM core busy, each of the input buffer, weight buffer, and register file have to expose sufficient read/write bandwidth.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/alu_core.png
+ :align: center
+ :width: 100%
+
+The **Tensor ALU** supports a set of standard operations to implement common activation, normalization, and pooling operators.
+Because VTA is a modular design, the range of operators that the Tensor ALU supports can be extended for higher operator coverage, at the expense of higher resource utilization.
+The Tensor ALU can perform tensor-tensor operations, as well as tensor-scalar operations on an immediate value.
+The tensor ALU opcode and the immediate value are specified by the high-level CISC instruction.
+In the context of tensor ALU computation, the micro-code only specifies data access patterns.
+
+.. note::
+ In terms of computational throughput, the Tensor ALU does not execute at a rate of one operation per cycle.
+ The limitation comes from the lack of read ports: since only one register file tensor can be read per cycle, the tensor ALU has an initiation interval of at least 2 (i.e. it performs at most one operation every two cycles).
+ In addition, performing a whole tensor-tensor operation at once can be expensive, especially given that register file types are wide (typically 32-bit integers).
+ As a result, in order to balance the resource utilization footprint of the Tensor ALU with the GEMM core, a tensor-tensor operation is by default performed via vector-vector operations over multiple cycles.
+
+
+Load and Store Modules
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/developer/2d_dma.png
+ :align: center
+ :width: 100%
+
+The load and store modules perform 2D DMA transfers with a strided access pattern between DRAM and SRAM.
+In addition, the load module can insert 2D padding on the fly, which is useful when blocking a 2D convolution.
+This means that VTA can tile 2D convolution inputs without paying the overhead of re-laying out data in DRAM to insert spatial padding around input and weight tiles.
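+
+A hedged software sketch of such a strided load with on-the-fly padding follows (argument names are illustrative; the hardware performs the equivalent work in a pipelined fashion):
+
+.. code:: python
+
+   def dma_load_2d(dram, base, x_size, y_size, x_stride, pad):
+       sram = []
+       for y in range(-pad, y_size + pad):
+           for x in range(-pad, x_size + pad):
+               inside = 0 <= x < x_size and 0 <= y < y_size
+               # out-of-bounds elements are materialized as zero padding
+               sram.append(dram[base + y * x_stride + x] if inside else 0)
+       return sram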
+
+
diff --git a/docs/vta/dev/index.rst b/docs/vta/dev/index.rst
new file mode 100644
index 000000000000..788bafe34b3e
--- /dev/null
+++ b/docs/vta/dev/index.rst
@@ -0,0 +1,14 @@
+VTA Design and Developer Guide
+==============================
+
+This developer guide details the complete VTA-TVM hardware-software stack.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_stack.png
+ :align: center
+ :width: 60%
+
+.. toctree::
+ :maxdepth: 2
+
+ config
+ hardware
\ No newline at end of file
diff --git a/docs/vta/hardware.rst b/docs/vta/hardware.rst
new file mode 100644
index 000000000000..294b99a8269f
--- /dev/null
+++ b/docs/vta/hardware.rst
@@ -0,0 +1,2 @@
+VTA Hardware Design Overview
+============================
diff --git a/docs/vta/index.rst b/docs/vta/index.rst
new file mode 100644
index 000000000000..d29dd9f2ffcf
--- /dev/null
+++ b/docs/vta/index.rst
@@ -0,0 +1,36 @@
+VTA: Deep Learning Accelerator Stack
+====================================
+
+The Versatile Tensor Accelerator (VTA) is an open, generic, and customizable deep learning accelerator with a complete TVM-based compiler stack. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators. Together TVM and VTA form an end-to-end hardware-software deep learning system stack that includes hardware design, drivers, a JIT runtime, and an optimizing compiler stack based on TVM.
+
+.. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png
+ :align: center
+ :width: 60%
+
+VTA has the following key features:
+
+- Generic, modular, open-source hardware.
+- Streamlined workflow to deploy to FPGAs.
+- Simulator support to prototype compilation passes on regular workstations.
+- Pynq-based driver and JIT runtime for both simulated and FPGA hardware back-ends.
+- End to end TVM stack integration.
+
+This page contains links to all the resources related to VTA:
+
+
+.. toctree::
+ :maxdepth: 1
+
+ install
+ dev/index
+ tutorials/index
+
+
+Literature
+----------
+
+- Read the VTA `release blog post`_.
+- Read the VTA tech report: `An Open Hardware Software Stack for Deep Learning`_.
+
+.. _release blog post: https://tvm.ai/2018/07/12/vta-release-announcement.html
+.. _An Open Hardware Software Stack for Deep Learning: https://arxiv.org/abs/1807.04188
\ No newline at end of file
diff --git a/docs/vta/install.md b/docs/vta/install.md
new file mode 100644
index 000000000000..ca5969386e80
--- /dev/null
+++ b/docs/vta/install.md
@@ -0,0 +1,263 @@
+VTA Installation Guide
+======================
+
+We present three installation guides, each building on the previous one:
+1. [Simulator installation](#vta-simulator-installation)
+2. [Hardware test setup](#vta-pynq-based-test-setup)
+3. [FPGA toolchain installation](#vta-fpga-toolchain-installation)
+
+## VTA Simulator Installation
+
+You need [TVM installed](https://docs.tvm.ai/install/index.html) on your machine.
+For a quick and easy start, use the pre-built [TVM Docker image](https://docs.tvm.ai/install/docker.html).
+
+The VTA simulator library is built by default with TVM.
+Add the VTA library to your python path to run the VTA examples.
+
+```bash
+export PYTHONPATH=/path/to/vta/python:${PYTHONPATH}
+```
+
+### Testing your VTA Simulation Setup
+
+To ensure that you've properly installed the VTA python package, run the following 2D convolution testbench.
+
+```bash
+python <tvm root>/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+```
+
+> Note: For every convolution layer, the throughput is reported in GOPS. These numbers reflect the computational throughput that the simulator achieves by evaluating the convolutions in software.
+
+You are invited to try out our [VTA programming tutorials](https://docs.tvm.ai/vta/tutorials/index.html).
+
+
+### Advanced Configuration (optional)
+
+VTA is a generic configurable deep learning accelerator.
+The configuration is specified by `vta_config.json` under the TVM root folder.
+This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack.
+
+The VTA configuration file also specifies the TVM compiler target.
+When `TARGET` is set to `sim`, all TVM workloads execute on the VTA simulator.
+You can modify the contents of the configuration file to rebuild VTA with a different parameterization.
+To do so:
+
+```bash
+cd <tvm root>
+cp vta/config/vta_config.json vta_config.json
+# edit vta_config.json
+make vta
+```
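+
+If you prefer to script the change, here is a minimal sketch that edits the copied file with Python's standard `json` module; the `TARGET` field is taken from this guide, and the other contents of the file are left untouched.
+
+```python
+import json
+
+with open("vta_config.json") as f:
+    cfg = json.load(f)
+print(cfg["TARGET"])    # "sim" in the default configuration
+cfg["TARGET"] = "pynq"  # e.g. retarget to the Pynq FPGA (see the next guide)
+with open("vta_config.json", "w") as f:
+    json.dump(cfg, f, indent=2)
+```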
+
+## VTA Pynq-Based Test Setup
+
+This second guide extends the *VTA Simulator Installation* guide above to run FPGA hardware tests of the complete TVM and VTA software-hardware stack.
+In terms of hardware components you'll need:
+* The [Pynq](http://www.pynq.io/) FPGA development board, which can be acquired for $200 ($150 for academics) from [Digilent](https://store.digilentinc.com/pynq-z1-python-productivity-for-zynq/).
+* An Ethernet-to-USB adapter to connect the Pynq board to your development machine.
+* An 8+GB micro SD card.
+* An AC to DC 12V 3A power adapter.
+
+This guide covers the following themes:
+1. Pynq board setup instructions.
+2. Pynq-side RPC server build and deployment.
+3. Revisiting the test examples from the *VTA Simulator Installation* guide, this time executing on the Pynq board.
+
+### Pynq Board Setup
+
+Setup your Pynq board based on the [Pynq board getting started tutorial](http://pynq.readthedocs.io/en/latest/getting_started.html).
+You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point).
+* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.1](http://pynq-testing.readthedocs.io/en/image_v2.2/getting_started/pynq_image.html) (released 21 Feb 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
+* For this test setup, follow the ["Connect to a Computer"](http://pynq.readthedocs.io/en/latest/getting_started.html#connect-to-a-computer) Ethernet setup instructions. To be able to talk to the board, make sure to [assign your computer a static IP address](http://pynq.readthedocs.io/en/latest/appendix.html#assign-your-computer-a-static-ip).
+
+Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board:
+```bash
+# To connect to the Pynq board use the [username, password] combo: [xilinx, xilinx]
+ssh xilinx@192.168.2.99
+```
+
+### Pynq-Side RPC Server Build & Deployment
+
+Because the direct board-to-computer connection prevents the board from accessing the internet, we'll need to mount the Pynq's file system into your development machine's file system with [sshfs](https://www.digitalocean.com/community/tutorials/how-to-use-sshfs-to-mount-remote-file-systems-over-ssh). We then clone the TVM repository into the sshfs mountpoint on the development machine.
+
+```bash
+# On the Host-side
+mkdir <mountpoint>
+sshfs xilinx@192.168.2.99:/home/xilinx <mountpoint>
+cd <mountpoint>
+git clone --recursive https://github.com/dmlc/tvm
+# When finished, you can leave the mountpoint and unmount the directory
+cd ~
+sudo umount <mountpoint>
+```
+
+Now that we've cloned the TVM repository (which includes VTA) into the Pynq's file system, we can ssh into the board and launch the build of the TVM-based RPC server.
+The build process should take roughly 5 minutes.
+
+```bash
+ssh xilinx@192.168.2.99
+# Build TVM runtime library (takes 5 mins)
+cd /home/xilinx/tvm
+mkdir build
+cp cmake/config.cmake build/.
+# Copy pynq specific configuration
+cp vta/config/pynq_sample.json build/vta_config.json
+cd build
+cmake ..
+make runtime vta -j2
+# Build VTA RPC server (takes 1 min)
+cd ..
+sudo ./apps/pynq_rpc/start_rpc_server.sh # pw is 'xilinx'
+```
+
+You should see the following displayed when the RPC server starts. In order to run the next examples, you'll need to leave the RPC server running in an `ssh` session.
+```
+INFO:root:RPCServer: bind to 0.0.0.0:9091
+```
+
+Tips regarding the Pynq RPC Server:
+* The RPC server should be listening on port `9091`. If not, an earlier process may have terminated unexpectedly; in that case, reboot the Pynq and re-run the RPC server.
+* To kill the RPC server, press `Ctrl + c`. You can re-run it with `sudo ./apps/pynq_rpc/start_rpc_server.sh`.
+* If unresponsive, the board can be rebooted by power-cycling it with the physical power switch.
+
+### Testing your Pynq-based Hardware Setup
+
+Before running the examples on your development machine, you'll need to configure your host environment as follows:
+```bash
+# On the Host-side
+export VTA_PYNQ_RPC_HOST=192.168.2.99
+export VTA_PYNQ_RPC_PORT=9091
+```
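+
+These variables are picked up by the host-side test scripts to open an RPC session to the board. A minimal sketch of what that looks like with TVM's RPC client (assuming the `tvm.rpc` module path):
+
+```python
+import os
+from tvm import rpc
+
+host = os.environ["VTA_PYNQ_RPC_HOST"]
+port = int(os.environ["VTA_PYNQ_RPC_PORT"])
+remote = rpc.connect(host, port)  # handle used to offload work to the Pynq
+```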
+
+In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.
+Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`.
+> Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board.
+
+```bash
+# On the Host-side
+cd <tvm root>
+cp vta/config/pynq_sample.json vta_config.json
+```
+
+Once again, we will run the 2D convolution testbench.
+First, we need to program the Pynq's FPGA with a VTA bitstream, and build the VTA runtime on the board via RPC.
+The following `test_program_rpc.py` script will perform two operations:
+* FPGA programming, by downloading a pre-compiled bitstream from a [VTA bitstream repository](https://github.com/uwsaml/vta-distro) that matches the default `vta_config.json` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA.
+* Runtime building on the Pynq, which needs to be run every time the `vta_config.json` configuration is modified. This ensures that the VTA software runtime that generates the accelerator's executable via just-in-time (JIT) compilation matches the specifications of the VTA design that is programmed on the FPGA. The build process takes about 30 seconds to complete so be patient!
+
+```bash
+# On the Host-side
+python <tvm root>/vta/tests/python/pynq/test_program_rpc.py
+```
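+
+Under the hood, the script roughly amounts to the following sketch; `vta.program_fpga` is used later in this guide, while `vta.reconfig_runtime` is an assumed helper name for the runtime rebuild step listed above.
+
+```python
+import os
+from tvm import rpc
+import vta
+
+remote = rpc.connect(os.environ["VTA_PYNQ_RPC_HOST"],
+                     int(os.environ["VTA_PYNQ_RPC_PORT"]))
+vta.program_fpga(remote, bitstream=None)  # None picks a pre-built bitstream matching vta_config.json
+vta.reconfig_runtime(remote)              # rebuild the JIT runtime for the current configuration
+```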
+
+> Tip: You can track progress of the FPGA programming and the runtime rebuilding steps by looking at the RPC server's logging messages in your Pynq `ssh` session.
+
+We are now ready to run the 2D convolution testbench in hardware.
+
+```bash
+# On the Host-side
+python <tvm root>/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+```
+
+The performance metrics measured on the Pynq board will be reported for each convolutional layer.
+
+You can also try out our [VTA programming tutorials](https://docs.tvm.ai/vta/tutorials/index.html).
+
+
+## VTA FPGA Toolchain Installation
+
+This third and last guide allows users to generate custom VTA bitstreams using free-to-use Xilinx compilation toolchains.
+
+### Xilinx Toolchain Installation
+
+We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains.
+Our guide is written for Linux (Ubuntu) installation.
+
+You’ll need to install Xilinx’s FPGA compilation toolchain, [Vivado HL WebPACK 2018.2](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain.
+
+#### Obtaining and Launching the Vivado GUI Installer
+
+1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2018-2.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2018.2: WebPACK and Editions.
+2. You’ll have to sign in with a Xilinx account. If you don’t have one, creating an account takes about two minutes.
+3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin`.
+4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed:
+```bash
+chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
+```
+5. Now you can execute the binary:
+```bash
+./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
+```
+
+#### Xilinx Vivado GUI Installer Steps
+
+At this point you've launched the Vivado 2018.2 installer GUI program.
+
+1. Click “Next” on the *Welcome* screen.
+2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next”.
+3. On the *Accept License Agreements* screen, accept all terms before clicking “Next”.
+4. On the *Select Edition to Install* screen, select “Vivado HL WebPACK” before clicking “Next”.
+5. Under the *Vivado HL WebPACK* screen, before hitting “Next”, check the following options (the rest should be unchecked):
+ * Design Tools -> Vivado Design Suite -> Vivado
+ * Devices -> Production Devices -> SoCs -> Zynq-7000 (if you are targeting the Pynq board)
+ * Devices -> Production Devices -> SoCs -> UltraScale+ MPSoC (if you are targeting the Ultra-96 board)
+6. The total download size is about 5GB, and the required disk space is 23GB.
+7. On the *Select Destination Directory* screen, set the installation directory before clicking “Next”. Some paths may be highlighted in red; that’s because the installer doesn’t have permission to write to them. In that case, select a path that doesn’t require special write permissions (e.g. your home directory).
+8. On the *Installation Summary* screen, hit “Install”.
+9. An *Installation Progress* window will pop up to track the progress of the download and the installation.
+10. This process will take about 20-30 minutes depending on your connection speed.
+11. A pop-up window will inform you that the installation completed successfully. Click "OK".
+12. Finally the *Vivado License Manager* will launch. Select "Get Free ISE WebPACK, ISE/Vivado IP or PetaLinux License" and click "Connect Now" to complete the license registration process.
+
+#### Environment Setup
+
+The last step is to update your `~/.bashrc` with the following lines, which put the Xilinx binaries on your `PATH` so you can launch compilation scripts from the command line (set `${XILINX_PATH}` to your Vivado installation directory).
+```bash
+# Xilinx Vivado 2018.2 environment
+export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2
+export PATH=${XILINX_VIVADO}/bin:${PATH}
+```
+
+### Custom VTA Bitstream Compilation
+
+High-level hardware parameters are listed in the VTA configuration file and can be customized by the user.
+For this custom VTA bitstream compilation exercise, we'll change the frequency of our design, so it can be clocked a little faster.
+* Set the `HW_FREQ` field to `142`. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it is to close timing; increasing the frequency can lead to timing violations and thus faulty hardware execution.
+* Set the `HW_CLK_TARGET` to `6`. This parameter refers to the target clock period in nanoseconds for HLS; a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock calls for a 7ns target (see the quick check below), but we intentionally lower the clock target to 6ns to pipeline the design more aggressively.
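+
+The clock-period arithmetic behind these two settings can be checked quickly:
+
+```python
+freq_mhz = 142
+period_ns = 1e3 / freq_mhz
+print(round(period_ns, 2))  # 7.04 -> a 142MHz clock corresponds to a ~7ns period
+```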
+
+Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/hardware/xilinx/`.
+
+If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter:
+```bash
+cd <tvm root>/vta/hardware/xilinx
+make ip MODE=sim
+```
+
+If you just want to generate the HLS-based VTA IP cores without launching the full place-and-route flow, enter:
+```bash
+make ip
+```
+You'll be able to view the HLS synthesis reports under `<tvm root>/vta/build/hardware/xilinx/hls/<configuration>/<module>/solution0/syn/report/<module>_csynth.rpt`.
+> Note: The `<configuration>` name is a string that summarizes the VTA configuration parameters listed in `vta_config.json`. The `<module>` name refers to one of the modules (HLS functions) that compose the high-level VTA pipeline.
+
+Finally, to run the full hardware compilation and generate the VTA bitstream, run:
+
+```bash
+make
+```
+
+This process is lengthy and can take up to an hour to complete, depending on your machine's specs.
+We recommend setting the `VTA_HW_COMP_THREADS` variable in the Makefile to take full advantage of all the cores on your development machine.
+
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
+
+### Use the Custom Bitstream
+
+We can program the new VTA FPGA bitstream by setting the bitstream path of the `vta.program_fpga()` function in the tutorial examples, or in the `test_program_rpc.py` script.
+
+```python
+vta.program_fpga(remote, bitstream="<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
+```
+
+Rather than downloading a pre-built bitstream from the VTA bitstream repository, TVM will now use the new bitstream you just generated, which is a VTA design clocked at a higher frequency.
+Do you observe a noticeable performance increase on the ImageNet classification example?
diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index 9c2b5194ebe9..93bff2762481 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file api_registry.h
+ * \file tvm/api_registry.h
* \brief This files include necessary headers to
* be used to register an global API function.
*/
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index c4f338f1cd47..6a3c395fd404 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file arithmetic.h
+ * \file tvm/arithmetic.h
* \brief Algebra and set operations and simplifications.
*/
#ifndef TVM_ARITHMETIC_H_
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 9a78c5ed503f..d113f45352bb 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file base.h
+ * \file tvm/base.h
* \brief Defines the base data structure
*/
#ifndef TVM_BASE_H_
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 610532e261a3..41fa1fa804a8 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file buffer.h
+ * \file tvm/buffer.h
* \brief Symbolic n-dimensional array, to represent a memory buffer.
*/
#ifndef TVM_BUFFER_H_
@@ -51,8 +51,11 @@ class Buffer : public NodeRef {
* \brief Get access ptr to the entire buffer.
* \param access_mask The access mask
* \param ptr_type The type of the pointer.
+ * \param content_lanes The number of lanes for the (data) type.
+ * \param offset The offset of ptr.
*/
- TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle()) const;
+ TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(),
+ int content_lanes = 1, Expr offset = make_const(Int(32), 0)) const;
/*!
* \brief Create an Expr that does a vector load at begin index.
* \param begin The beginning index
@@ -122,6 +125,11 @@ class BufferNode : public Node {
v->Visit("offset_factor", &offset_factor);
}
+ /*! \return preferred index type for this buffer node */
+ Type DefaultIndexType() const {
+ return shape.size() != 0 ? shape[0].type() : Int(32);
+ }
+
// User can specify data_alignment and offset_factor to be 0
// A default value will be picked.
TVM_DLL static Buffer make(Var ptr,
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
new file mode 100644
index 000000000000..96b876fe92f0
--- /dev/null
+++ b/include/tvm/build_module.h
@@ -0,0 +1,464 @@
+/*!
+* Copyright (c) 2017 by Contributors
+* \file tvm/build_module.h
+* \brief Functions for compiling ops.
+*/
+#ifndef TVM_BUILD_MODULE_H_
+#define TVM_BUILD_MODULE_H_
+
+#include <string>
+#include <vector>
+#include <utility>
+#include <unordered_map>
+#include <unordered_set>
+#include "./runtime/packed_func.h"
+#include "./schedule_pass.h"
+#include "./lowered_func.h"
+
+namespace tvm {
+using namespace tvm::runtime;
+
+/*!
+* \brief Container for target device information.
+* Use target::llvm, target::cuda etc functions instead of constructing directly.
+*/
+class TargetNode : public Node {
+ public:
+ /*! \brief The name of the target device */
+ std::string target_name;
+ /*! \brief The name of the specific device */
+ std::string device_name;
+ /*! \brief The type of the target device */
+ int device_type;
+ /*! \brief The maximum threads that a schedule should use for this device */
+ int max_num_threads = 1;
+ /*! \brief The warp size that should be used by the LowerThreadAllreduce pass */
+ int thread_warp_size = 1;
+ /*! \brief Keys for this target */
+ Array<Expr> keys_array;
+ /*! \brief Options for this target */
+ Array<Expr> options_array;
+ /*! \brief Collection of imported libs */
+ Array<Expr> libs_array;
+
+ /*! \return the full device string to pass to codegen::Build */
+ EXPORT std::string str() const;
+
+ void VisitAttrs(AttrVisitor* v) final {
+ v->Visit("target_name", &target_name);
+ v->Visit("device_name", &device_name);
+ v->Visit("device_type", &device_type);
+ v->Visit("max_num_threads", &max_num_threads);
+ v->Visit("thread_warp_size", &thread_warp_size);
+ v->Visit("keys_array", &keys_array);
+ v->Visit("options_array", &options_array);
+ v->Visit("libs_array", &libs_array);
+ }
+
+ /*! \brief Get the keys for this target as a vector of string */
+ EXPORT std::vector<std::string> keys() const;
+
+ /*! \brief Get the options for this target as a vector of string */
+ EXPORT std::vector<std::string> options() const;
+
+ /*! \brief Get the imported libs for this target as an unordered_set of string */
+ EXPORT std::unordered_set<std::string> libs() const;
+
+ static constexpr const char* _type_key = "Target";
+ TVM_DECLARE_NODE_TYPE_INFO(TargetNode, Node);
+};
+
+class Target : public NodeRef {
+ public:
+ Target() {}
+ explicit Target(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+ /*!
+ * \brief Create a Target given a string
+ * \param target_str the string to parse
+ */
+ EXPORT static Target create(const std::string& target_str);
+
+ /*!
+ * \brief Push a new target context onto the thread local stack. The Target on top of
+ * the stack is used to determine which specialization to use when invoking a GenericFunc.
+ * \param target The target to set as the current context.
+ */
+ EXPORT static void EnterTargetScope(const tvm::Target& target);
+
+ /*!
+ * \brief Pop a target off the thread local context stack, restoring the previous target
+ * as the current context.
+ */
+ EXPORT static void ExitTargetScope();
+
+ /*!
+ * \brief Get the current target context from thread local storage.
+ * \param allow_not_defined If the context stack is empty and this is set to true, an
+ * undefined Target will be returned. Otherwise, an empty context stack will cause a
+ * runtime error.
+ * \return The target that is the current context. The target may not be defined if
+ * allow_not_defined is true.
+ */
+ EXPORT static tvm::Target current_target(bool allow_not_defined = true);
+
+ inline const TargetNode* operator->() const {
+ return static_cast<const TargetNode*>(node_.get());
+ }
+
+ using ContainerType = TargetNode;
+};
+
+/*!
+ * \brief RAII container to provide a scoped target context. Pushes a target onto the
+ * context stack when constructed, and pops it when destructed.
+ */
+struct TargetContext {
+ /*!
+ * \brief Enter a new target context. The given target becomes the new current context.
+ * When the TargetContext is destructed, the previous context is restored.
+ * \param target The target to set as the new current context.
+ */
+ explicit TargetContext(const tvm::Target& target) {
+ Target::EnterTargetScope(target);
+ }
+
+ /*! \brief Destructor. Pops the context off the thread local stack. */
+ ~TargetContext() {
+ Target::ExitTargetScope();
+ }
+};
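+
+// Example usage (sketch):
+//
+//   {
+//     TargetContext ctx(target::cuda());    // CUDA becomes the current target
+//     Target t = Target::current_target();  // returns the CUDA target
+//   }                                       // previous target restored here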
+
+/*! \brief This namespace provides functions to construct Target instances */
+namespace target {
+/*! \return A target for LLVM */
+EXPORT Target llvm(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for CUDA */
+EXPORT Target cuda(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for ROCm */
+EXPORT Target rocm(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for OpenCL */
+EXPORT Target opencl(const std::vector<std::string>& options =
+                     std::vector<std::string>());
+
+/*! \return A target for Metal */
+EXPORT Target metal(const std::vector<std::string>& options =
+                    std::vector<std::string>());
+
+/*! \return A target for rasp */
+EXPORT Target rasp(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for Mali */
+EXPORT Target mali(const std::vector<std::string>& options =
+                   std::vector<std::string>());
+
+/*! \return A target for Intel Graphics */
+EXPORT Target intel_graphics(const std::vector<std::string>& options =
+                             std::vector<std::string>());
+
+/*! \return A target for stackvm */
+EXPORT Target stackvm(const std::vector<std::string>& options =
+                      std::vector<std::string>());
+
+} // namespace target
+
+class BuildConfig;
+
+/*!
+* \brief Container for build configuration options
+*/
+class BuildConfigNode : public Node {
+ public:
+ /*!
+ * \brief The data alignment to use when constructing buffers. If this is set to
+ * -1, then TVM's internal default will be used
+ */
+ int data_alignment = -1;
+ /*!
+ * \brief The offset factor to use when constructing buffers. If this is set to
+ * 0, then the offset field is not used.
+ */
+ int offset_factor = 0;
+
+ /*!
+ * \brief Splitting factor for loop splitting. If this is set to zero, no splitting will be
+ * done. Otherwise, a split will be done with this factor and the inner loop will be unrolled.
+ */
+ int double_buffer_split_loop = 1;
+ /*! \brief Threshold of number of steps in the loop to be automatically unrolled */
+ int auto_unroll_max_step = 0;
+ /*! \brief The maximum nested level of loops that can be automatically unrolled */
+ int auto_unroll_max_depth = 8;
+ /*! \brief The maximum extent of loop that will be unrolled */
+ int auto_unroll_max_extent = 0;
+ /*!
+ * \brief Whether to explicitly unroll the loop. If set to false, the unroll hint will
+ * be passed to the CodeGen phase. Set to true if CodeGen supports unroll pragma.
+ */
+ bool unroll_explicit = true;
+
+ /*! \brief Set to true if buffer arguments do not overlap. This enables more optimization. */
+ bool restricted_func = true;
+
+ /*! \brief Whether to detect global barrier */
+ bool detect_global_barrier = false;
+
+ /*! \brief Whether to partition const loop */
+ bool partition_const_loop = false;
+
+ /*! \brief List of (phase, function) passes to be injected into the lowering pipeline */
+ std::vector< std::pair<int, PackedFunc> > add_lower_pass;
+
+ /*! \brief Whether to dump the IR of each pass (only when building from python) */
+ bool dump_pass_ir = false;
+
+ void VisitAttrs(AttrVisitor* v) final {
+ v->Visit("data_alignment", &data_alignment);
+ v->Visit("offset_factor", &offset_factor);
+ v->Visit("double_buffer_split_loop", &double_buffer_split_loop);
+ v->Visit("auto_unroll_max_step", &auto_unroll_max_step);
+ v->Visit("auto_unroll_max_depth", &auto_unroll_max_depth);
+ v->Visit("auto_unroll_max_extent", &auto_unroll_max_extent);
+ v->Visit("unroll_explicit", &unroll_explicit);
+ v->Visit("restricted_func", &restricted_func);
+ v->Visit("detect_global_barrier", &detect_global_barrier);
+ v->Visit("partition_const_loop", &partition_const_loop);
+ v->Visit("dump_pass_ir", &dump_pass_ir);
+ }
+
+ static constexpr const char* _type_key = "BuildConfig";
+ TVM_DECLARE_NODE_TYPE_INFO(BuildConfigNode, Node);
+};
+
+/*!
+* \brief Container for build configuration options
+*/
+class BuildConfig : public ::tvm::NodeRef {
+ public:
+ BuildConfig() {}
+ explicit BuildConfig(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+
+ const BuildConfigNode* operator->() const {
+ return static_cast<const BuildConfigNode*>(node_.get());
+ }
+
+ BuildConfigNode* operator->() {
+ return static_cast<BuildConfigNode*>(node_.get());
+ }
+
+ /*!
+ * \brief Push a new BuildConfig context onto the thread local stack.
+ * \param build_config The configuration to set as the current context.
+ */
+ EXPORT static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
+
+ /*!
+ * \brief Pop a build config off the thread local context stack, restoring the previous
+ * configuration as the current context.
+ */
+ EXPORT static void ExitBuildConfigScope();
+
+ /*!
+ * \brief Get the current BuildConfig context from thread local storage, or a default
+ * configuration if a BuildConfig scope has not been entered.
+ * \return The configuration that is the current context.
+ */
+ EXPORT static tvm::BuildConfig Current();
+
+ using ContainerType = BuildConfigNode;
+};
+
+/*!
+ * \brief RAII container to provide a scoped BuildConfig context. Pushes a configuration onto the
+ * context stack when constructed, and pops it when destructed.
+ */
+struct BuildConfigContext {
+ /*!
+ * \brief Enter a new BuildConfig context. The given BuildConfig becomes the new current
+ * context. When the BuildConfigContext is destructed, the previous context is restored.
+ * \param build_config The BuildConfig to set as the new current context.
+ */
+ explicit BuildConfigContext(const tvm::BuildConfig& build_config) {
+ BuildConfig::EnterBuildConfigScope(build_config);
+ }
+
+ /*! \brief Destructor. Pops the context off the thread local stack. */
+ ~BuildConfigContext() {
+ BuildConfig::ExitBuildConfigScope();
+ }
+};
+
+/*!
+* \brief Construct a BuildConfig containing a new BuildConfigNode
+* \return The new BuildConfig
+*/
+EXPORT BuildConfig build_config();
+
+/*!
+* \brief Build a LoweredFunc given a schedule, args and binds
+* \param sch The schedule to lower.
+* \param args The arguments to the function.
+* \param name The name of the lowered function.
+* \param binds Buffer assignments.
+* \param config The build configuration.
+* \return The lowered function.
+*/
+EXPORT Array<LoweredFunc> lower(Schedule sch,
+ const Array<Tensor>& args,
+ const std::string& name,
+ const std::unordered_map<Tensor, Buffer>& binds,
+ const BuildConfig& config);
+
+/*!
+* \brief Build a device and host module for a specific target from an array of lowered functions.
+* \param funcs The functions to be built.
+* \param target The target device to build for.
+* \param target_host The target for building host code. To use the default, pass Target()
+* \param config The build configuration.
+* \return The built module.
+*/
+EXPORT runtime::Module build(const Array<LoweredFunc>& funcs,
+ const Target& target,
+ const Target& target_host,
+ const BuildConfig& config);
+
+class GenericFuncNode;
+
+/*!
+ * \brief Generic function that can be specialized on a per-target basis.
+ */
+class GenericFunc : public NodeRef {
+ public:
+ GenericFunc() {}
+ explicit GenericFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+ /*!
+ * \brief Set the default function implementation.
+ * \param value The default function
+ * \param allow_override If true, this call may override a previously registered function. If
+ * false, an error will be logged if the call would override a previously registered function.
+ * \return reference to self.
+ */
+ TVM_DLL GenericFunc& set_default(const PackedFunc value,
+ bool allow_override = false);
+ /*!
+ * \brief Register a specialized function
+ * \param tags The tags for this specialization
+ * \param value The specialized function
+ * \param allow_override If true, this call may override previously registered tags. If false,
+ * an error will be logged if the call would override previously registered tags.
+ * \return reference to self.
+ */
+ TVM_DLL GenericFunc& register_func(const std::vector<std::string>& tags,
+ const PackedFunc value,
+ bool allow_override = false);
+ /*!
+ * \brief Call generic function by directly passing in unpacked format.
+ * \param args Arguments to be passed.
+ * \tparam Args arguments to be passed.
+ *
+ * \code
+ * // Example code on how to call generic function
+ * void CallGeneric(GenericFunc f) {
+ * // call like normal functions by pass in arguments
+ * // return value is automatically converted back
+ * int rvalue = f(1, 2.0);
+ * }
+ * \endcode
+ */
+ template<typename... Args>
+ inline TVMRetValue operator()(Args&& ...args) const;
+ /*!
+ * \brief Invoke the relevant function for the current target context, set by Target::EnterTargetScope or a TargetContext.
+ * Arguments are passed in packed format.
+ * \param args The arguments to pass to the function.
+ * \param ret The return value
+ */
+ TVM_DLL void CallPacked(TVMArgs args, TVMRetValue* ret) const;
+
+ /*!
+ * \brief Find or register the GenericFunc instance corresponding to the given name
+ * \param name The name of the registered GenericFunc
+ * \return The GenericFunc instance
+ */
+ TVM_DLL static GenericFunc Get(const std::string& name);
+
+ /*!
+ * \brief Add a GenericFunc instance to the registry
+ * \param func The GenericFunc instance
+ * \param name The name of the registered GenericFunc
+ */
+ TVM_DLL static void RegisterGenericFunc(GenericFunc func, const std::string& name);
+
+ /*!
+ * \brief access the internal node container
+ * \return the pointer to the internal node container
+ */
+ inline GenericFuncNode* operator->();
+
+ // declare container type
+ using ContainerType = GenericFuncNode;
+
+ // Internal class.
+ struct Manager;
+
+ private:
+ friend struct Manager;
+};
+
+template<typename... Args>
+inline TVMRetValue GenericFunc::operator()(Args&& ...args) const {
+ const int kNumArgs = sizeof...(Args);
+ const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
+ TVMValue values[kArraySize];
+ int type_codes[kArraySize];
+ detail::for_each(TVMArgsSetter(values, type_codes),
+ std::forward<Args>(args)...);
+ TVMRetValue rv;
+ CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv);
+ return rv;
+}
+
+/*!
+ * \brief Represents a generic function that can be specialized on a per-target basis.
+ */
+class GenericFuncNode : public Node {
+ public:
+ /*! \brief name of the function */
+ std::string name_;
+ /*! \brief the generic builder */
+ PackedFunc generic_func_;
+ /*! \brief map from keys to registered functions */
+ std::unordered_map<std::string, PackedFunc> dispatch_dict_;
+
+ static constexpr const char* _type_key = "GenericFunc";
+ TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node);
+};
+
+inline GenericFuncNode* GenericFunc::operator->() {
+ return static_cast<GenericFuncNode*>(node_.get());
+}
+
+#define TVM_GENERIC_FUNC_REG_VAR_DEF \
+ static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_ ## TVM
+
+/*!
+ * \def TVM_REGISTER_GENERIC_FUNC
+ * \brief Register a new generic function, or set a device-specific variant
+ * of the corresponding function.
+ *
+ * \param name The name of the function
+ */
+#define TVM_REGISTER_GENERIC_FUNC(name) \
+ TVM_STR_CONCAT(TVM_GENERIC_FUNC_REG_VAR_DEF, __COUNTER__) = \
+ ::tvm::GenericFunc::Get(#name)
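+
+// Example usage (sketch; the generic function name and packed functions are
+// illustrative):
+//
+//   TVM_REGISTER_GENERIC_FUNC(my_schedule)
+//     .set_default(my_default_packed_func)
+//     .register_func({"cuda", "gpu"}, my_cuda_packed_func);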
+
+
+} // namespace tvm
+
+#endif // TVM_BUILD_MODULE_H_
diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h
index f81018a7610e..6f15ef9a3e80 100644
--- a/include/tvm/c_dsl_api.h
+++ b/include/tvm/c_dsl_api.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file c_dsl_api.h
+ * \file tvm/c_dsl_api.h
*
* \brief TVM DSL Node C API, used to interact to DSL compilation.
*
@@ -17,7 +17,7 @@
#include "./runtime/c_runtime_api.h"
#ifdef __cplusplus
-TVM_EXTERN_C {
+extern "C" {
#endif
/*! \brief handle to node */
diff --git a/include/tvm/channel.h b/include/tvm/channel.h
index 56adff4ad8df..28d9b5f7ce4a 100644
--- a/include/tvm/channel.h
+++ b/include/tvm/channel.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file channel.h
+ * \file tvm/channel.h
* \brief Channel object for pipeline.
*/
#ifndef TVM_CHANNEL_H_
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index c43227bb1164..6b5116a143cc 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file codegen.h
+ * \file tvm/codegen.h
* \brief Collection of Lowlevel IR pass to codegen.
*/
#ifndef TVM_CODEGEN_H_
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 4e4e25c0ce7d..8c789f8df1dc 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file expr.h
+ * \file tvm/expr.h
* \brief The Expr and related elements in DataFlow construction.
*/
#ifndef TVM_EXPR_H_
@@ -16,31 +16,31 @@
namespace tvm {
-using Halide::Type;
-using Halide::Float;
-using Halide::Bool;
-using Halide::Int;
-using Halide::UInt;
-using Halide::Handle;
-using Halide::ExprHash;
-using Halide::ExprEqual;
+using HalideIR::Type;
+using HalideIR::Float;
+using HalideIR::Bool;
+using HalideIR::Int;
+using HalideIR::UInt;
+using HalideIR::Handle;
+using HalideIR::ExprHash;
+using HalideIR::ExprEqual;
-using Halide::Expr;
-using Halide::VarExpr;
-using Halide::IR::RangeNode;
-using Halide::IR::FunctionRef;
-using Halide::IR::FunctionBaseNode;
-using Halide::Internal::Stmt;
-using Halide::Internal::IRPrinter;
-using Halide::Internal::Variable;
+using HalideIR::Expr;
+using HalideIR::VarExpr;
+using HalideIR::IR::RangeNode;
+using HalideIR::IR::FunctionRef;
+using HalideIR::IR::FunctionBaseNode;
+using HalideIR::Internal::Stmt;
+using HalideIR::Internal::IRPrinter;
+using HalideIR::Internal::Variable;
-using Halide::Internal::make_const;
-using Halide::Internal::make_zero;
-using Halide::Internal::as_const_int;
-using Halide::Internal::as_const_uint;
-using Halide::Internal::const_true;
-using Halide::Internal::const_false;
-using Halide::Internal::is_no_op;
+using HalideIR::Internal::make_const;
+using HalideIR::Internal::make_zero;
+using HalideIR::Internal::as_const_int;
+using HalideIR::Internal::as_const_uint;
+using HalideIR::Internal::const_true;
+using HalideIR::Internal::const_false;
+using HalideIR::Internal::is_no_op;
inline Type TVMShapeIndexType() {
if (std::is_signed<tvm_index_t>::value) {
@@ -51,7 +51,7 @@ inline Type TVMShapeIndexType() {
}
inline Type TVMType2Type(TVMType t) {
- return Type(static_cast<halide_type_code_t>(t.code), t.bits, t.lanes);
+ return Type(static_cast<halideir_type_code_t>(t.code), t.bits, t.lanes);
}
inline TVMType Type2TVMType(Type t) {
@@ -71,9 +71,9 @@ inline int GetVectorBytes(Type dtype) {
}
/*! \brief a named variable in TVM */
-class Var : public Halide::VarExpr {
+class Var : public HalideIR::VarExpr {
public:
- explicit Var(const std::string& name_hint = "v",
+ EXPORT explicit Var(const std::string& name_hint = "v",
Type t = Int(32)) : VarExpr(name_hint, t) {}
explicit Var(std::shared_ptr<Node> n) : VarExpr(n) {}
explicit Var(VarExpr v) : VarExpr(v) {}
@@ -94,7 +94,7 @@ class Var : public Halide::VarExpr {
class IterVarNode;
/*!
- * \brief same as Halide::IR::Range
+ * \brief same as HalideIR::IR::Range
* except it provide an constructor with (begin, end)
*
* \note Traditional Halide's Range have a constructor with
@@ -102,11 +102,11 @@ class IterVarNode;
* We decided to correct it by removing the constructor in HalideIR,
* and add it back in TVM's range.
*/
-class Range : public Halide::IR::Range {
+class Range : public HalideIR::IR::Range {
public:
/*! \brief constructor */
Range() {}
- explicit Range(std::shared_ptr<Node> n) : Halide::IR::Range(n) {}
+ explicit Range(std::shared_ptr<Node> n) : HalideIR::IR::Range(n) {}
/*!
* \brief constructor by begin and end
* \param begin The begin of the range.
@@ -291,6 +291,13 @@ inline const char* IterVarType2String(IterVarType t) {
return "Unknown";
}
+/*!
+ * \brief Construct a new Var expression
+ * \param name_hint The name hint for the expression
+ * \param t The type of the expression
+ */
+TVM_DLL Var var(const std::string& name_hint, Type t = Int(32));
+
/*
* \brief Template function to convert Map to unordered_map
* Sometimes useful for API gluing when internal uses unordered_map
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index ae53d38b82b2..9ea16131188d 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir.h
+ * \file tvm/ir.h
* \brief Additional high level nodes in the IR
*/
#ifndef TVM_IR_H_
@@ -16,11 +16,11 @@
namespace tvm {
namespace ir {
-using Halide::Internal::ExprNode;
-using Halide::Internal::StmtNode;
-using Halide::Internal::IRNodeType;
-using Halide::Internal::ForType;
-using Halide::DeviceAPI;
+using HalideIR::Internal::ExprNode;
+using HalideIR::Internal::StmtNode;
+using HalideIR::Internal::IRNodeType;
+using HalideIR::Internal::ForType;
+using HalideIR::DeviceAPI;
// Node container for CommReducer
struct CommReducerNode;
@@ -152,6 +152,12 @@ constexpr const char* coproc_scope = "coproc_scope";
constexpr const char* coproc_uop_scope = "coproc_uop_scope";
/*! \brief Mark the scope as volatile access for certain handle. */
constexpr const char* volatile_scope = "volatile_scope";
+/*!
+ * \brief Mark the scope as generated by an extern primitive.
+ *  Such a scope can contain an arbitrary IR program, so we need to be careful
+ *  when making assumptions about its structure.
+ */
+constexpr const char* extern_scope = "extern_scope";
/*!
* \brief Mark the scope as when computation start to happen
* This can hint some code generator to create a new function for compute.
@@ -171,8 +177,10 @@ constexpr const char* device_context_type = "device_context_type";
constexpr const char* loop_scope = "loop_scope";
/*! \brief Mark of reduce scope */
constexpr const char* reduce_scope = "reduce_scope";
-/*! \brief Mark region is guarded by the pragma */
-constexpr const char* pragma_scope = "pragma_scope";
+/*! \brief Mark region is guarded by the pragma extension */
+constexpr const char* pragma_scope_prefix = "pragma_";
+/*! \brief Import llvm source or file into the final code gen module */
+constexpr const char* pragma_import_llvm = "pragma_import_llvm";
/*!
* \brief Mark of prefetch scope, value=offset,
* run prefetch of Tensor on the current loop scope
@@ -220,6 +228,23 @@ constexpr const char* channel_write_advance = "channel_write_advance";
constexpr const char* pipeline_stage_scope = "pipeline_stage_scope";
/*! \brief pipeline execution scope, implies the scope can be pipelined. */
constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
+/*!
+ * \brief Mark that this stage is an OpenGL shader. Since OpenGL shader only
+ * allows writing out to one element of the output texture, the Provide node
+ * gets translated to a special Call::glsl_texture_store statement instead of a
+ * Store statement.
+ */
+constexpr const char* opengl_stage_scope = "opengl_stage_scope";
+
+/*!
+ * \brief Check if attr_key is a pragma key extension
+ * \param attr_key The attr key to be compared
+ * \return true if it is a pragma key
+ */
+inline bool IsPragmaKey(const std::string& attr_key) {
+ return attr_key.compare(0, 7, "pragma_") == 0;
+}
+
} // namespace attr
/*! \brief namespace of TVM Intrinsic functions */
@@ -256,6 +281,11 @@ constexpr const char* tvm_if_then_else = "tvm_if_then_else";
* }
*/
constexpr const char* tvm_access_ptr = "tvm_access_ptr";
+/*!
+ * \brief Create a function-local static handle that initializes to nullptr.
+ *  It can be used to cache function-local static resources.
+ */
+constexpr const char* tvm_static_handle = "tvm_static_handle";
/*!
* \brief Return a unique context id, used for hint of workspace separation.
* Different context ids guarantee non-overlapping workspaces.
@@ -393,6 +423,14 @@ constexpr const char* tvm_call_packed_lowered = "tvm_call_packed_lowered";
* }
*/
constexpr const char* tvm_storage_sync = "tvm_storage_sync";
+/*!
+ * \brief See pseudo code
+ *
+ * Type tvm_warp_shuffle(Type value, warp_id) {
+ * return (value passed in by warp indicated by warp_id);
+ * }
+ */
+constexpr const char* tvm_warp_shuffle = "tvm_warp_shuffle";
/*!
* \brief Initialize the global barrier.
* Call this at beginning of kernel that need global barrier.
@@ -433,51 +471,61 @@ enum TVMStructFieldKind : int {
} // namespace intrinsic
// Reuse IR node definitions from HalideIR
-using Halide::Internal::IntImm;
-using Halide::Internal::UIntImm;
-using Halide::Internal::FloatImm;
-using Halide::Internal::StringImm;
-using Halide::Internal::Cast;
-using Halide::Internal::Add;
-using Halide::Internal::Sub;
-using Halide::Internal::Mul;
-using Halide::Internal::Div;
-using Halide::Internal::Mod;
-using Halide::Internal::Min;
-using Halide::Internal::Max;
-using Halide::Internal::EQ;
-using Halide::Internal::NE;
-using Halide::Internal::LT;
-using Halide::Internal::LE;
-using Halide::Internal::GT;
-using Halide::Internal::GE;
-using Halide::Internal::And;
-using Halide::Internal::Or;
-using Halide::Internal::Not;
-using Halide::Internal::Select;
-using Halide::Internal::Load;
-using Halide::Internal::Ramp;
-using Halide::Internal::Broadcast;
-using Halide::Internal::Call;
-using Halide::Internal::Let;
-using Halide::Internal::LetStmt;
-using Halide::Internal::AttrStmt;
-using Halide::Internal::AssertStmt;
-using Halide::Internal::ProducerConsumer;
-using Halide::Internal::For;
-using Halide::Internal::Store;
-using Halide::Internal::Provide;
-using Halide::Internal::Allocate;
-using Halide::Internal::Free;
-using Halide::Internal::Realize;
-using Halide::Internal::Prefetch;
-using Halide::Internal::Block;
-using Halide::Internal::IfThenElse;
-using Halide::Internal::Evaluate;
-using Halide::Internal::Shuffle;
+using HalideIR::Internal::IntImm;
+using HalideIR::Internal::UIntImm;
+using HalideIR::Internal::FloatImm;
+using HalideIR::Internal::StringImm;
+using HalideIR::Internal::Cast;
+using HalideIR::Internal::Add;
+using HalideIR::Internal::Sub;
+using HalideIR::Internal::Mul;
+using HalideIR::Internal::Div;
+using HalideIR::Internal::Mod;
+using HalideIR::Internal::Min;
+using HalideIR::Internal::Max;
+using HalideIR::Internal::EQ;
+using HalideIR::Internal::NE;
+using HalideIR::Internal::LT;
+using HalideIR::Internal::LE;
+using HalideIR::Internal::GT;
+using HalideIR::Internal::GE;
+using HalideIR::Internal::And;
+using HalideIR::Internal::Or;
+using HalideIR::Internal::Not;
+using HalideIR::Internal::Select;
+using HalideIR::Internal::Load;
+using HalideIR::Internal::Ramp;
+using HalideIR::Internal::Broadcast;
+using HalideIR::Internal::Call;
+using HalideIR::Internal::Let;
+using HalideIR::Internal::LetStmt;
+using HalideIR::Internal::AttrStmt;
+using HalideIR::Internal::AssertStmt;
+using HalideIR::Internal::ProducerConsumer;
+using HalideIR::Internal::For;
+using HalideIR::Internal::Store;
+using HalideIR::Internal::Provide;
+using HalideIR::Internal::Allocate;
+using HalideIR::Internal::Free;
+using HalideIR::Internal::Realize;
+using HalideIR::Internal::Prefetch;
+using HalideIR::Internal::Block;
+using HalideIR::Internal::IfThenElse;
+using HalideIR::Internal::Evaluate;
+using HalideIR::Internal::Shuffle;
// ir functions
-using Halide::Internal::is_const_power_of_two_integer;
+using HalideIR::Internal::is_const_power_of_two_integer;
+/*!
+ * \brief Create a type annotation expression
+ * \param dtype The data type
+ * \return Expr a expression with dtype.
+ */
+inline Expr TypeAnnotation(Type dtype) {
+ return ir::Call::make(dtype,
+ "type_annotation", {},
+ ir::Call::PureIntrinsic);
+}
} // namespace ir
} // namespace tvm
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 55368fbea14d..3784608c8da1 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file ir_functor_ext.h
+ * \file tvm/ir_functor_ext.h
* \brief More powerful Visitor that allows define function signatures.
*/
#ifndef TVM_IR_FUNCTOR_EXT_H_
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index 1faf1724ddb6..b8aae3638149 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_mutator.h
+ * \file tvm/ir_mutator.h
* \brief Defines general IRMutation pass
*/
#ifndef TVM_IR_MUTATOR_H_
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index a0726f0030ab..947c3b736d80 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file ir_operator.h
+ * \file tvm/ir_operator.h
* \brief Common operators of Expr
*/
#ifndef TVM_IR_OPERATOR_H_
@@ -12,14 +12,13 @@
namespace tvm {
-using Halide::likely;
-using Halide::likely_if_innermost;
+using HalideIR::likely;
+using HalideIR::likely_if_innermost;
// functions
-using Halide::cast;
-using Halide::min;
-using Halide::max;
-using Halide::abs;
-using Halide::select;
+using HalideIR::cast;
+using HalideIR::min;
+using HalideIR::max;
+using HalideIR::select;
/*!
* \brief sum of source expression over axis
@@ -42,16 +41,55 @@ TVM_DLL Expr max(Expr source, Array axis);
*/
TVM_DLL Expr min(Expr source, Array axis);
+
// Unary intrinsic operators
#define TVM_DECLARE_INTRIN_UNARY(OpName) \
inline Expr OpName(Expr x) { \
- return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureExtern); \
+ return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \
} \
+
TVM_DECLARE_INTRIN_UNARY(exp);
TVM_DECLARE_INTRIN_UNARY(tanh);
TVM_DECLARE_INTRIN_UNARY(sigmoid);
TVM_DECLARE_INTRIN_UNARY(sqrt);
+TVM_DECLARE_INTRIN_UNARY(log);
+TVM_DECLARE_INTRIN_UNARY(floor);
+TVM_DECLARE_INTRIN_UNARY(ceil);
+TVM_DECLARE_INTRIN_UNARY(round);
+TVM_DECLARE_INTRIN_UNARY(trunc);
+
+/*!
+ * \brief Calculate power(x, y)
+ * \param x The left operand.
+ * \param y The right operand.
+ */
+inline Expr pow(Expr x, Expr y) {
+ match_types(x, y);
+ CHECK(x.type().is_float()) << "power only applies to float";
+ return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+}
+
+/*!
+ * \brief Calculate absolute value of x, elementwise
+ * \param x The input data
+ *
+ * \return The absolute value of input data x
+ */
+inline Expr abs(Expr x) {
+ if (x.type().is_int()) {
+ return select(x >= make_zero(x.type()), x, -x);
+ } else if (x.type().is_float()) {
+ return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
+ } else if (x.type().is_uint()) {
+ return x;
+ } else {
+ LOG(WARNING) << "Data type " << x.type()
+ << " not supported for absolute op. Skipping absolute op...";
+ return x;
+ }
+}
+
} // namespace tvm
#endif // TVM_IR_OPERATOR_H_
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 6b95bd268652..d875621a3f5e 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_pass.h
+ * \file tvm/ir_pass.h
* \brief Collection of IR pass functions
*
* When the pass functions in this file are for Stmt,
@@ -22,27 +22,39 @@
namespace tvm {
namespace ir {
-inline Expr Simplify(Expr a) {
- return Halide::Internal::simplify(a);
-}
+/*!
+ * \brief Simplify the expression.
+ * \param expr The expression to be simplified.
+ * \param vrange The range information about the variable.
+ * \return Simplified expression.
+ */
+EXPORT Expr Simplify(Expr expr, Map<Var, Range> vrange = Map<Var, Range>());
-inline Stmt Simplify(Stmt a) {
- return Halide::Internal::simplify(a);
-}
+/*!
+ * \brief Simplify the statement.
+ * \param stmt The statement to be simplified.
+ * \param vrange The range information about the variable.
+ * \return Simplified statement.
+ */
+Stmt Simplify(Stmt stmt, Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Simplify by applying canonical form.
* \param stmt The statement to be canonically simplified.
+ * \param vrange The range information about the variable.
* \return Canonicalized statement.
*/
-Stmt CanonicalSimplify(Stmt stmt);
+Stmt CanonicalSimplify(Stmt stmt,
+ Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Simplify by applying canonical form.
* \param expr The expression to be canonically simplified.
+ * \param vrange The range information about the variable.
* \return Canonicalized expression.
*/
-Expr CanonicalSimplify(Expr expr);
+EXPORT Expr CanonicalSimplify(Expr expr,
+ Map<Var, Range> vrange = Map<Var, Range>());
/*!
* \brief Deep compare lhs and rhs
@@ -50,7 +62,7 @@ Expr CanonicalSimplify(Expr expr);
* \param rhs The right operand
* \return The comparison result.
*/
-bool Equal(const Expr& lhs, const Expr& rhs);
+EXPORT bool Equal(const Expr& lhs, const Expr& rhs);
/*!
* \brief Deep compare lhs and rhs
@@ -204,11 +216,17 @@ Stmt NarrowChannelAccess(Stmt stmt);
*
* \param stmt The statment to be unrolled.
* \param auto_max_step The maximum step before stop attach automatic unroll
- * \param auto_min_depth The minimum depth before we can start automatic unroll
+ * \param auto_max_depth The maximum depth before stop attach automatic unroll
+ * \param auto_max_extent The maximum extent of the loop we can unroll;
+ * this is a legacy option that does not take the loop's total steps into account.
* \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen.
* \return Transformed stmt.
*/
-Stmt UnrollLoop(Stmt stmt, int auto_max_step, int auto_min_depth, bool explicit_unroll);
+Stmt UnrollLoop(Stmt stmt,
+ int auto_max_step,
+ int auto_max_depth,
+ int auto_max_extent,
+ bool explicit_unroll);
/*!
* \brief vectorize the constant loops
@@ -271,9 +289,10 @@ Stmt StorageRewrite(Stmt stmt);
/*!
* \brief partition loops in the stmt
* \param stmt The stmt to do loop partition
+ * \param split_const_loop flag to enable partition for const loop
* \return Transformed stmt.
*/
-Stmt LoopPartition(Stmt stmt);
+Stmt LoopPartition(Stmt stmt, bool split_const_loop);
/*!
* \brief Detect and insert sync points to co-processor.
@@ -388,6 +407,29 @@ LoweredFunc ThreadSync(LoweredFunc stmt, std::string storage_scope);
*/
LoweredFunc LowerThreadAllreduce(LoweredFunc f, int warp_size);
+/*!
+ * \brief Lower warp memory in stmt.
+ * \param f The device function to be lowered.
+ * \param warp_size the size of a warp, within which no sync is needed.
+ * This function only takes effect if warp_size is greater than one.
+ * \return Transformed function.
+ */
+LoweredFunc LowerWarpMemory(LoweredFunc f, int warp_size);
+
+/*!
+ * \brief Remap the thread axis
+ *
+ * This can be used to get equivalent program which uses
+ * threadIdx.y in place of threadIdx.x by passing
+ * {"threadIdx.x": thread_axis("threadIdx.y")}
+ *
+ *
+ * \param f The device function to be lowered.
+ * \param axis_map The map from StringImm -> IterVar
+ * \return Transformed function.
+ */
+LoweredFunc RemapThreadAxis(LoweredFunc f, Map<Expr, IterVar> axis_map);
+
/*!
* \brief Lower packed function call.
* \param f The function to be lowered.
@@ -402,6 +444,18 @@ LoweredFunc LowerTVMBuiltin(LoweredFunc f);
*/
LoweredFunc CombineContextCall(LoweredFunc f);
+/*!
+ * \brief Rewrite the pointer content type of arguments,
+ * as well as Allocs internal to the function, to use
+ * the most frequently accessed type for load/store,
+ * so as to avoid pointer casting in the backend when possible.
+ *
+ * \note Implemented in storage_rewrite.cc.
+ * \param f The function to be transformed.
+ * \return Transformed function.
+ */
+LoweredFunc PointerValueTypeRewrite(LoweredFunc f);
+
/*!
* \brief Lower intrinsic function calls.
* \param f The device function to be lowered.
@@ -409,6 +463,44 @@ LoweredFunc CombineContextCall(LoweredFunc f);
* \return Transformed function.
*/
LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target);
+
+/*!
+ * \brief Verify if memory accesses are legal for a specific target device type.
+ *
+ * In the case that the target is CUDA, if not all of the workload is
+ * bound to threads, CPU code that tries to access GPU memory is
+ * generated, which is illegal. This pass performs verification for this case.
+ *
+ * \param func The function to be verified.
+ * \param device_type The target device type.
+ * \return Success of memory verification.
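+ *
+ * A minimal sketch (assumes func is intended for a GPU target; kDLGPU is
+ * the dlpack device type code for GPUs):
+ *
+ * \code
+ * bool ok = VerifyMemory(func, kDLGPU);
+ * \endcode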
+ */
+bool VerifyMemory(LoweredFunc func, int device_type);
+
+
+/*!
+ * \brief Verify the correctness of GPU code.
+ * It checks whether the amount of memory used or the number of threads
+ * in a block exceeds the limits.
+ * \param stmt The statement to be checked
+ * \param constraints The dict to specify constraints to check.
+ * Possible keys are
+ *
+ * "max_local_memory_per_block": Total amount of local memory per block (in bytes).
+ * "max_shared_memory_per_block": Total amount of shared memory per block (in bytes).
+ * "max_threads_per_block": Maximum number of threads per block.
+ * "max_thread_x": Maximum length of threadIdx.x.
+ * "max_thread_y": Maximum length of threadIdx.y.
+ * "max_thread_z": Maximum length of threadIdx.z.
+ *
+ * If one key is missing in this argument, the pass won't check for that item.
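+ *
+ * A minimal sketch of building the constraint dict (the key/value choices
+ * are illustrative only):
+ *
+ * \code
+ * Map<std::string, Expr> constraints;
+ * constraints.Set("max_threads_per_block", 1024);
+ * constraints.Set("max_shared_memory_per_block", 49152);
+ * bool valid = VerifyGPUCode(stmt, constraints);
+ * \endcode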
+ * \return Whether it is valid GPU code.
+ */
+bool VerifyGPUCode(Stmt stmt,
+ Map<std::string, Expr> constraints);
+
+
} // namespace ir
} // namespace tvm
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 7cfd45b833c8..8919b0f7a5c2 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file ir_visitor.h
+ * \file tvm/ir_visitor.h
* \brief Visitor to quickly visit IR trees
*/
#ifndef TVM_IR_VISITOR_H_
diff --git a/include/tvm/logging.h b/include/tvm/logging.h
new file mode 100644
index 000000000000..070b6e092a2e
--- /dev/null
+++ b/include/tvm/logging.h
@@ -0,0 +1,99 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file tvm/logging.h
+ * \brief logging utilities on top of dmlc-core
+ */
+#ifndef TVM_LOGGING_H_
+#define TVM_LOGGING_H_
+
+// A technique that enables overloading a macro based on its number of parameters.
+// It is used to define the other macros below.
+#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
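+//
+// For illustration only (FOO and FOO_N are hypothetical, not part of this
+// header): a macro defined as
+//   #define FOO(...) GET_MACRO(__VA_ARGS__, FOO_5, FOO_4, FOO_3, FOO_2, FOO_1)(__VA_ARGS__)
+// dispatches FOO(a, b) to FOO_2(a, b) and FOO(a, b, c) to FOO_3(a, b, c).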
+
+/*!
+ * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X
+ * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined in dmlc-core/include/dmlc/logging.h).
+ * COND_X (but not COND_X_N) is supposed to be used outside this file.
+ * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
+ * is a boolean. The rest of the parameters of COND_X are the same as the parameters of X.
+ * quit_on_assert determines the overall behaviour of COND_X. If it's true, COND_X
+ * quits the program on assertion failure. If it's false, it moves on and reports
+ * the assertion failure back to the macro caller in an appropriate manner (e.g., 'return false'
+ * in a function, or 'continue' or 'break' in a loop).
+ * The default behavior when quit_on_assert is false is to 'return false'. If this is not
+ * desirable, the macro caller can pass one last extra parameter to COND_X to tell COND_X what
+ * to do when quit_on_assert is false and the assertion fails.
+ *
+ * Rationale: These macros were designed to concisely implement functions that have
+ * two behaviours: quitting on assertion failures, or trying to move on from them.
+ * Note that these macros hide a lot of control flow, and therefore make the logic
+ * of the surrounding code slightly harder to follow. However, in pieces of code that
+ * use these macros frequently, they significantly shorten the amount of code to be
+ * read, and keep the main logic of the function free of repetitive control-flow
+ * structure. The readability cost also diminishes as developers get used to the macros.
+ *
+ * Here is an example of how to use it
+ * \code
+ * bool f(..., bool quit_on_assertion) {
+ * int a = 0, b = 0;
+ * ...
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default behaviour)
+ * COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quitting"
+ * ...
+ * for (int i = 0; i < N; i++) {
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
+ * // behaviour, therefore, has to be explicitly specified)
+ * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quitting"
+ * }
+ * }
+ * \endcode
+ */
+#define COND_CHECK_GE(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
+#define COND_CHECK_EQ(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
+#define COND_CHECK(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
+#define COND_LOG(...) \
+ GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
+
+// Not supposed to be used by users directly.
+#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
+ if (!quit_on_assert) { \
+ if (!((x) op (y))) \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ CHECK_##op(x, y)
+
+#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
+#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
+
+#define COND_CHECK_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ if (!(x)) \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ CHECK(x)
+
+#define COND_LOG_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ what; \
+ } \
+ else /* NOLINT(*) */ \
+ LOG(x)
+
+#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
+#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
+
+#endif // TVM_LOGGING_H_
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 7b7ebcf1e4d5..19f7e27f1c75 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2017 by Contributors
- * \file lowered_func.h
+ * \file tvm/lowered_func.h
* \brief Information about a lowered TVM function.
* This data structure is final step toward codegen.
*/
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index 8242bfbeefb4..d13680531af9 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file operation.h
+ * \file tvm/operation.h
* \brief Operation node can generate one or multiple Tensors
*/
#ifndef TVM_OPERATION_H_
@@ -41,6 +41,8 @@ class OperationNode : public FunctionBaseNode {
std::string name;
/*! \brief optional tag of the operation */
std::string tag;
+ /*! \brief additional attributes of the operation */
+ Map<std::string, NodeRef> attrs;
/*! \return name of the operation */
const std::string& func_name() const final {
return name;
@@ -117,11 +119,13 @@ class OperationNode : public FunctionBaseNode {
 * \brief Build the statement that provides the output tensors.
* \param stage The schedule stage of the op.
* \param dom_map The domain map of all iteration domains.
+ * \param debug_keep_trivial_loop Whether to keep trivial loops with extent of 1.
 * \return A statement that adds production and wraps the consumer.
*/
virtual Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const = 0;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const = 0;
static constexpr const char* _type_key = "Operation";
@@ -160,10 +164,13 @@ class PlaceholderOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
+ v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("shape", &shape);
v->Visit("dtype", &dtype);
}
@@ -178,7 +185,7 @@ class PlaceholderOpNode : public OperationNode {
/*!
* \brief A Compute op that compute a tensor on certain domain.
*/
-class ComputeOpNode : public OperationNode {
+class TVM_DLL ComputeOpNode : public OperationNode {
public:
/*! \brief IterVar on each axis */
 Array<IterVar> axis;
@@ -211,17 +218,20 @@ class ComputeOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("axis", &axis);
v->Visit("reduce_axis", &reduce_axis);
v->Visit("body", &body);
}
 static Operation make(std::string name,
 std::string tag,
+ Map<std::string, NodeRef> attrs,
 Array<IterVar> axis,
 Array<Expr> body);
@@ -282,11 +292,13 @@ class ScanOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("scan_axis", &scan_axis);
v->Visit("init", &init);
v->Visit("update", &update);
@@ -296,6 +308,7 @@ class ScanOpNode : public OperationNode {
}
 static Operation make(std::string name,
 std::string tag,
+ Map<std::string, NodeRef> attrs,
 IterVar axis,
 Array<Tensor> init,
 Array<Tensor> update,
@@ -345,20 +358,23 @@ class ExternOpNode : public OperationNode {
const Stmt& body) const final;
Stmt BuildProvide(
const Stage& stage,
- const std::unordered_map<IterVar, Range>& dom_map) const final;
+ const std::unordered_map<IterVar, Range>& dom_map,
+ bool debug_keep_trivial_loop) const final;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("name", &name);
v->Visit("tag", &tag);
+ v->Visit("attrs", &attrs);
v->Visit("inputs", &inputs);
v->Visit("body", &body);
}
- static Operation make(std::string name,
- std::string tag,
- Array<Tensor> inputs,
- Array<Buffer> input_placeholders,
- Array<Buffer> output_placeholders,
- Stmt body);
+ EXPORT static Operation make(std::string name,
+ std::string tag,
+ Map<std::string, NodeRef> attrs,
+ Array<Tensor> inputs,
+ Array<Buffer> input_placeholders,
+ Array<Buffer> output_placeholders,
+ Stmt body);
static constexpr const char* _type_key = "ExternOp";
TVM_DECLARE_NODE_TYPE_INFO(ExternOpNode, OperationNode);
@@ -387,11 +403,13 @@ TVM_DLL Tensor placeholder(Array<Expr> shape,
* \param fcompute The compute function to create the tensor.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
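+ *
+ * A minimal sketch of attaching attrs (assumes an existing Tensor A; the
+ * attribute key and value are illustrative):
+ *
+ * \code
+ * Map<std::string, NodeRef> attrs;
+ * attrs.Set("layout", ir::StringImm::make("NCHW"));
+ * Tensor B = compute(A->shape,
+ *                    [&](const Array<Var>& i) { return A(i) + 1; },
+ *                    "B", "elemwise", attrs);
+ * \endcode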
*/
TVM_DLL Tensor compute(Array<Expr> shape,
 FCompute fcompute,
 std::string name = "tensor",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
/*!
* \brief Construct a new tensor by computing over shape,
@@ -400,11 +418,13 @@ TVM_DLL Tensor compute(Array<Expr> shape,
* \param fcompute The compute function to create the tensors.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
*/
TVM_DLL Array<Tensor> compute(Array<Expr> shape,
 FBatchCompute fcompute,
 std::string name = "tensor",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
/*!
* \brief Construct new tensors by scan.
@@ -416,42 +436,48 @@ TVM_DLL Array<Tensor> compute(Array<Expr> shape,
* but recommended to provide concrete information about scan body.
* \param name The optional name of the tensor.
* \param tag The optional tag of the tensor.
+ * \param attrs Optional additional attributes of the compute.
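+ *
+ * A sketch of invoking scan (assumes s_init, s_update, s_state, and input X
+ * are set up as in the TVM scan tutorial):
+ *
+ * \code
+ * Array<Tensor> out = scan({s_init}, {s_update}, {s_state}, {X});
+ * \endcode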
*/
TVM_DLL Array<Tensor> scan(Array<Tensor> init,
 Array<Tensor> update,
 Array<Tensor> state_placeholder,
 Array<Tensor> inputs = Array<Tensor>(),
 std::string name = "scan",
- std::string tag = "");
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {});
// same as compute, specialized for different fcompute function
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1], i[2]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
inline Tensor compute(Array<Expr> shape,
 std::function<Expr(Var, Var, Var, Var)> f,
 std::string name = "tensor",
- std::string tag = "") {
+ std::string tag = "",
+ Map<std::string, NodeRef> attrs = {}) {
 FCompute fc = [f] (const Array<Var>& i) { return f(i[0], i[1], i[2], i[3]); };
- return compute(shape, fc, name, tag);
+ return compute(shape, fc, name, tag, attrs);
}
// inline function.
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 1f66232baacc..95964547ef8e 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -1,6 +1,6 @@
/*!
* Copyright (c) 2016 by Contributors
- * \file packed_func_ext.h
+ * \file tvm/packed_func_ext.h
* \brief Extension package to PackedFunc
 * This enables passing NodeRef types into/from PackedFunc.
*/
@@ -14,6 +14,7 @@
#include "./base.h"
#include "./expr.h"
+#include "./tensor.h"
#include "./runtime/packed_func.h"
namespace tvm {
@@ -59,6 +60,25 @@ struct NodeTypeChecker<Array<T> > {
}
};
+template<typename K, typename V>
+struct NodeTypeChecker<Map<K, V> > {