From b4cf76a7d349813d54fb2e2872e48ef6b7db11f9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 13 Apr 2018 22:05:09 -0700
Subject: [PATCH] [RUNTIME] Simplify dynamic library and code path. (#27)

* [RUNTIME] Simplify dynamic library and code path.

* reword the readme
---
 vta/Makefile                                  |  16 +--
 vta/README.md                                 |  24 +++-
 vta/make/config.mk                            |   7 +-
 vta/python/vta/__init__.py                    |   3 +-
 vta/python/vta/environment.py                 |  65 ++++++++--
 vta/python/vta/exec/rpc_server.py             |  53 +++++----
 vta/python/vta/testing/simulator.py           |   1 -
 vta/python/vta/testing/util.py                |  14 ++-
 vta/src/data_buffer.cc                        |  44 -------
 vta/src/data_buffer.h                         |  90 --------------
 .../{tvm/vta_device_api.cc => device_api.cc}  |  29 ++---
 vta/src/runtime.cc                            | 112 +++++++++++++++++-
 vta/tests/python/unittest/test_vta_insn.py    |  15 +++
 13 files changed, 260 insertions(+), 213 deletions(-)
 delete mode 100644 vta/src/data_buffer.cc
 delete mode 100644 vta/src/data_buffer.h
 rename vta/src/{tvm/vta_device_api.cc => device_api.cc} (75%)

diff --git a/vta/Makefile b/vta/Makefile
index 6bfa82dc2e10..e112b39c9f58 100644
--- a/vta/Makefile
+++ b/vta/Makefile
@@ -53,12 +53,9 @@ else
 	NO_WHOLE_ARCH= --no-whole-archive
 endif
 
-
-all: lib/libvta.so lib/libvta_runtime.so
-
 VTA_LIB_SRC = $(wildcard src/*.cc src/tvm/*.cc)
 
-ifeq ($(TARGET), VTA_PYNQ_TARGET)
+ifeq ($(VTA_TARGET), pynq)
 	VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
 	LDFLAGS += -L/usr/lib -lsds_lib
 	LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
@@ -66,24 +63,23 @@ ifeq ($(TARGET), VTA_PYNQ_TARGET)
 	LDFLAGS += -l:libdma.so
 endif
 
-ifeq ($(TARGET), sim)
+ifeq ($(VTA_TARGET), sim)
 	VTA_LIB_SRC += $(wildcard src/sim/*.cc)
 endif
 
 VTA_LIB_OBJ = $(patsubst src/%.cc, build/%.o, $(VTA_LIB_SRC))
 
+all: lib/libvta.so
+
 build/%.o: src/%.cc
 	@mkdir -p $(@D)
-	$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/$*.d
+	$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
 	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-lib/libvta.so: $(filter-out build/runtime.o, $(VTA_LIB_OBJ))
+lib/libvta.so: $(VTA_LIB_OBJ)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
 
-lib/libvta_runtime.so: build/runtime.o
-	@mkdir -p $(@D)
-	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
 
 lint: pylint cpplint
 
diff --git a/vta/README.md b/vta/README.md
index 39d094cc54e6..5408f3b950b1 100644
--- a/vta/README.md
+++ b/vta/README.md
@@ -1,11 +1,25 @@
-Open Hardware/Software Stack for Vertical Deep Learning System Optimization
-==============================================
+VTA: Open, Modular, Deep Learning Accelerator Stack
+===================================================
 
 [![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
 
-VTA is an open hardware/software co-design stack for deep learning systems systems.
-It provides a customizable hardware accelerator template for deep learning inference workloads,
-combined with a fully functional compiler stack built with TVM.
+VTA (Versatile Tensor Accelerator) is an open-source deep learning accelerator stack.
+It is not just open-source hardware, but an end-to-end solution that includes
+the entire software stack on top of the VTA open-source hardware.
+
+
+The key features include:
+
+- Generic, modular open-source hardware
+  - Streamlined workflow to deploy to FPGAs.
+  - Simulator support
+- Driver and JIT runtime for both simulated backend and FPGA.
+- End to end TVM stack integration
+  - Directly optimize and deploy models from deep learning frameworks via the TVM stack.
+  - Customized and extensible TVM compiler backend
+  - Flexible RPC support to ease deployment; you can program it with Python :)
+
+VTA is part of our effort on [TVM Stack](http://www.tvmlang.org/).
 
 License
 -------
diff --git a/vta/make/config.mk b/vta/make/config.mk
index e329dcf987b8..2bf25132e245 100644
--- a/vta/make/config.mk
+++ b/vta/make/config.mk
@@ -26,8 +26,8 @@ ADD_LDFLAGS=
 # the additional compile flags you want to add
 ADD_CFLAGS=
 
-# the hardware target
-TARGET = pynq
+# the hardware target, can be [sim, pynq]
+VTA_TARGET = pynq
 
 #---------------------
 # VTA hardware parameters
@@ -88,7 +88,8 @@ $(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_A
 VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_BUFF_SIZE) ))" )
 
 # Update ADD_CFLAGS
-ADD_CFLAGS += \
+ADD_CFLAGS +=\
+	-DVTA_TARGET=$(VTA_TARGET)\
 	-DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \
 	-DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \
 	-DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \
diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py
index 693a4124f40b..80091f80d164 100644
--- a/vta/python/vta/__init__.py
+++ b/vta/python/vta/__init__.py
@@ -1,5 +1,6 @@
 """TVM-based VTA Compiler Toolchain"""
 from __future__ import absolute_import as _abs
+import sys
 
 from .environment import get_env, Environment
 
@@ -10,5 +11,5 @@
     from .rpc_client import reconfig_runtime, program_fpga
 
     from . import graph
-except ImportError:
+except (ImportError, RuntimeError):
     pass
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index 8ff2bbce2787..a59e66a564b2 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -3,14 +3,10 @@
 from __future__ import absolute_import as _abs
 
 import os
+import glob
 import copy
-
-try:
-    # Allow missing import in config mode.
-    import tvm
-    from . import intrin
-except ImportError:
-    pass
+import tvm
+from . import intrin
 
 
 class DevContext(object):
@@ -65,6 +61,45 @@ def get_task_qid(self, qid):
         return 1 if self.DEBUG_NO_SYNC else qid
 
 
+class PkgConfig(object):
+    """Build-flag provider for compiling the VTA runtime library.
+
+    Exposes include flags, source files, macro definitions and linker flags.
+    """
+    def __init__(self, env):
+        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+        proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
+        # -I flags: VTA headers plus the TVM, dlpack and dmlc-core headers under nnvm/
+        self.include_path = [
+            "-I%s/include" % proj_root,
+            "-I%s/nnvm/tvm/include" % proj_root,
+            "-I%s/nnvm/tvm/dlpack/include" % proj_root,
+            "-I%s/nnvm/dmlc-core/include" % proj_root
+        ]
+        # Sources for a standalone library: src/*.cc plus the src/<env.TARGET>/ subdirectory.
+        self.lib_source = []
+        self.lib_source += glob.glob("%s/src/*.cc" % proj_root)
+        self.lib_source += glob.glob("%s/src/%s/*.cc" % (proj_root, env.TARGET))
+        # One -DVTA_<key>=<value> define for every key in env.cfg_keys
+        self.macro_defs = []
+        for key in env.cfg_keys:
+            self.macro_defs.append("-DVTA_%s=%s" % (key, str(getattr(env, key))))
+
+        if env.TARGET == "pynq":
+            self.ldflags = [
+                "-L/usr/lib",
+                "-lsds_lib",
+                "-L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/",
+                "-L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/",
+                "-l:libdma.so"]
+        else:
+            self.ldflags = []
+
+    @property
+    def cflags(self):
+        return self.include_path + self.macro_defs
+
+
 class Environment(object):
     """Hareware configuration object.
 
@@ -160,6 +195,7 @@ def __init__(self, cfg):
         self.mock_mode = False
         self._mock_env = None
         self._dev_ctx = None
+        self._pkg_config = None
 
     @property
     def dev(self):
@@ -168,6 +204,13 @@ def dev(self):
             self._dev_ctx = DevContext(self)
         return self._dev_ctx
 
+    @property
+    def pkg_config(self):
+        """PkgConfig instance"""
+        if self._pkg_config is None:
+            self._pkg_config = PkgConfig(self)
+        return self._pkg_config
+
     @property
     def mock(self):
         """A mock version of the Environment
@@ -249,7 +292,7 @@ def mem_info_wgt_buffer():
                          head_address=None)
 
 @tvm.register_func("tvm.info.mem.%s" % Environment.acc_scope)
-def mem_info_out_buffer():
+def mem_info_acc_buffer():
     spec = get_env()
     return tvm.make.node("MemoryInfo",
                          unit_bits=spec.ACC_ELEM_BITS,
@@ -265,6 +308,7 @@ def coproc_sync(op):
         "int32", "VTASynchronize",
         get_env().dev.command_handle, 1<<31)
 
+
 @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_push")
 def coproc_dep_push(op):
     return tvm.call_extern(
@@ -272,6 +316,7 @@ def coproc_dep_push(op):
         get_env().dev.command_handle,
         op.args[0], op.args[1])
 
+
 @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_pop")
 def coproc_dep_pop(op):
     return tvm.call_extern(
@@ -288,7 +333,6 @@ def _init_env():
 
     for k in Environment.cfg_keys:
         keys.add("VTA_" + k)
-    keys.add("TARGET")
 
     if not os.path.isfile(filename):
         raise RuntimeError(
@@ -303,8 +347,9 @@ def _init_env():
                     val = line.split("=")[1].strip()
                     if k.startswith("VTA_"):
                         k = k[4:]
+                    try:
                         cfg[k] = int(val)
-                    else:
+                    except ValueError:
                         cfg[k] = val
     return Environment(cfg)
 
diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py
index ebaf15f8dc37..ca99416f7818 100644
--- a/vta/python/vta/exec/rpc_server.py
+++ b/vta/python/vta/exec/rpc_server.py
@@ -9,26 +9,46 @@
 import os
 import ctypes
 import tvm
+from tvm._ffi.base import c_str
 from tvm.contrib import rpc, cc
 
+from ..environment import get_env
+
 
 @tvm.register_func("tvm.contrib.rpc.server.start", override=True)
 def server_start():
-    """callback when server starts."""
+    """VTA RPC server extension."""
     # pylint: disable=unused-variable
     curr_path = os.path.dirname(
         os.path.abspath(os.path.expanduser(__file__)))
     dll_path = os.path.abspath(
-        os.path.join(curr_path, "../../../lib/libvta_runtime.so"))
+        os.path.join(curr_path, "../../../lib/libvta.so"))
     runtime_dll = []
     _load_module = tvm.get_global_func("tvm.contrib.rpc.server.load_module")
 
-    @tvm.register_func("tvm.contrib.rpc.server.load_module", override=True)
-    def load_module(file_name):
+    def load_vta_dll():
+        """Load the VTA dll on first use and return the cached ctypes handle."""
+        if not runtime_dll:
+            logging.info("Loading VTA library: %s", dll_path)
+            runtime_dll.append(ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL))
+        return runtime_dll[0]
+
+    @tvm.register_func("tvm.contrib.rpc.server.load_module", override=True)
+    def load_module(file_name):
+        load_vta_dll()
         return _load_module(file_name)
 
+    @tvm.register_func("device_api.ext_dev")
+    def ext_dev_callback():
+        load_vta_dll()  # libvta.so re-registers "device_api.ext_dev" with override on load
+        return tvm.get_global_func("device_api.ext_dev")()  # now the native DeviceAPI getter
+
+    @tvm.register_func("tvm.contrib.vta.init", override=True)
+    def program_fpga(file_name):
+        path = tvm.get_global_func("tvm.contrib.rpc.server.workpath")(file_name)
+        load_vta_dll().VTAProgram(c_str(path))
+        logging.info("Program FPGA with %s", file_name)
+
     @tvm.register_func("tvm.contrib.rpc.server.shutdown", override=True)
     def server_shutdown():
         if runtime_dll:
@@ -47,17 +67,15 @@ def reconfig_runtime(cflags):
         if runtime_dll:
             raise RuntimeError("Can only reconfig in the beginning of session...")
         cflags = cflags.split()
+        env = get_env()
         cflags += ["-O2", "-std=c++11"]
+        cflags += env.pkg_config.include_path
+        ldflags = env.pkg_config.ldflags
         lib_name = dll_path
-        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-        proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
-        runtime_source = os.path.join(proj_root, "src/runtime.cc")
-        cflags += ["-I%s/include" % proj_root]
-        cflags += ["-I%s/nnvm/tvm/include" % proj_root]
-        cflags += ["-I%s/nnvm/tvm/dlpack/include" % proj_root]
-        cflags += ["-I%s/nnvm/dmlc-core/include" % proj_root]
-        logging.info("Rebuild runtime dll with %s", str(cflags))
-        cc.create_shared(lib_name, [runtime_source], cflags)
+        source = env.pkg_config.lib_source
+        logging.info("Rebuild runtime: output=%s, cflags=%s, source=%s, ldflags=%s",
+                     dll_path, str(cflags), str(source), str(ldflags))
+        cc.create_shared(lib_name, source, cflags + ldflags)
 
 
 def main():
@@ -75,14 +93,6 @@ def main():
                         help="Report to RPC tracker")
     args = parser.parse_args()
     logging.basicConfig(level=logging.INFO)
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
-    lib_path = os.path.abspath(os.path.join(proj_root, "lib/libvta.so"))
-
-    libs = []
-    for file_name in [lib_path]:
-        libs.append(ctypes.CDLL(file_name, ctypes.RTLD_GLOBAL))
-        logging.info("Load additional library %s", file_name)
 
     if args.tracker:
         url, port = args.tracker.split(":")
@@ -99,7 +109,6 @@ def main():
                         args.port_end,
                         key=args.key,
                         tracker_addr=tracker_addr)
-    server.libs += libs
     server.proc.join()
 
 if __name__ == "__main__":
diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py
index bb436a1853a8..3505e49eeb04 100644
--- a/vta/python/vta/testing/simulator.py
+++ b/vta/python/vta/testing/simulator.py
@@ -10,7 +10,6 @@ def _load_lib():
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     dll_path = [
         os.path.abspath(os.path.join(curr_path, "../../../lib/libvta.so")),
-        os.path.abspath(os.path.join(curr_path, "../../../lib/libvta_runtime.so"))
     ]
     runtime_dll = []
     if not all(os.path.exists(f) for f in dll_path):
diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py
index bbf6417a167e..402546c0efa1 100644
--- a/vta/python/vta/testing/util.py
+++ b/vta/python/vta/testing/util.py
@@ -15,10 +15,18 @@ def run(run_func):
     run_func : function(env, remote)
     """
     env = get_env()
-    # run on simulator
-    if simulator.enabled():
+
+    # Run on local sim rpc if necessary
+    local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
+    if local_rpc:
         env.TARGET = "sim"
-        run_func(env, rpc.LocalSession())
+        remote = rpc.connect("localhost", local_rpc)
+        run_func(env, remote)
+    else:
+        # run on simulator
+        if simulator.enabled():
+            env.TARGET = "sim"
+            run_func(env, rpc.LocalSession())
 
     # Run on PYNQ if env variable exists
     pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None)
diff --git a/vta/src/data_buffer.cc b/vta/src/data_buffer.cc
deleted file mode 100644
index 99f959ad8c8b..000000000000
--- a/vta/src/data_buffer.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file data_buffer.cc
- * \brief Buffer related API for VTA.
- * \note Buffer API remains stable across VTA designes.
- */
-#include "./data_buffer.h"
-
-void* VTABufferAlloc(size_t size) {
-  return vta::DataBuffer::Alloc(size);
-}
-
-void VTABufferFree(void* buffer) {
-  vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));
-}
-
-void VTABufferCopy(const void* from,
-                   size_t from_offset,
-                   void* to,
-                   size_t to_offset,
-                   size_t size,
-                   int kind_mask) {
-  vta::DataBuffer* from_buffer = nullptr;
-  vta::DataBuffer* to_buffer = nullptr;
-
-  if (kind_mask & 2) {
-    from_buffer = vta::DataBuffer::FromHandle(from);
-    from = from_buffer->virt_addr();
-  }
-  if (kind_mask & 1) {
-    to_buffer = vta::DataBuffer::FromHandle(to);
-    to = to_buffer->virt_addr();
-  }
-  if (from_buffer) {
-    from_buffer->InvalidateCache(from_offset, size);
-  }
-
-  memcpy(static_cast<char*>(to) + to_offset,
-         static_cast<const char*>(from) + from_offset,
-         size);
-  if (to_buffer) {
-    to_buffer->FlushCache(to_offset, size);
-  }
-}
diff --git a/vta/src/data_buffer.h b/vta/src/data_buffer.h
deleted file mode 100644
index fba46dc07efa..000000000000
--- a/vta/src/data_buffer.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file data_buffer.h
- * \brief VTA runtime internal data buffer structure.
- */
-#ifndef VTA_DATA_BUFFER_H_
-#define VTA_DATA_BUFFER_H_
-
-#include <vta/driver.h>
-#include <vta/runtime.h>
-#include <cassert>
-#include <cstring>
-
-namespace vta {
-
-/*! \brief Enable coherent access between VTA and CPU. */
-static const bool kBufferCoherent = true;
-
-/*!
- * \brief Data buffer represents data on CMA.
- */
-struct DataBuffer {
-  /*! \return Virtual address of the data. */
-  void* virt_addr() const {
-    return data_;
-  }
-  /*! \return Physical address of the data. */
-  uint32_t phy_addr() const {
-    return phy_addr_;
-  }
-  /*!
-   * \brief Invalidate the cache of given location in data buffer.
-   * \param offset The offset to the data.
-   * \param size The size of the data.
-   */
-  void InvalidateCache(size_t offset, size_t size) {
-    if (!kBufferCoherent) {
-      VTAInvalidateCache(phy_addr_ + offset, size);
-    }
-  }
-  /*!
-   * \brief Invalidate the cache of certain location in data buffer.
-   * \param offset The offset to the data.
-   * \param size The size of the data.
-   */
-  void FlushCache(size_t offset, size_t size) {
-    if (!kBufferCoherent) {
-      VTAFlushCache(phy_addr_ + offset, size);
-    }
-  }
-  /*!
-   * \brief Allocate a buffer of a given size.
-   * \param size The size of the buffer.
-   */
-  static DataBuffer* Alloc(size_t size) {
-    void* data = VTAMemAlloc(size, 1);
-    assert(data != nullptr);
-    DataBuffer* buffer = new DataBuffer();
-    buffer->data_ = data;
-    buffer->phy_addr_ = VTAMemGetPhyAddr(data);
-    return buffer;
-  }
-  /*!
-   * \brief Free the data buffer.
-   * \param buffer The buffer to be freed.
-   */
-  static void Free(DataBuffer* buffer) {
-    VTAMemFree(buffer->data_);
-    delete buffer;
-  }
-  /*!
-   * \brief Create data buffer header from buffer ptr.
-   * \param buffer The buffer pointer.
-   * \return The corresponding data buffer header.
-   */
-  static DataBuffer* FromHandle(const void* buffer) {
-    return const_cast<DataBuffer*>(
-        reinterpret_cast<const DataBuffer*>(buffer));
-  }
-
- private:
-  /*! \brief The internal data. */
-  void* data_;
-  /*! \brief The physical address of the buffer, excluding header. */
-  uint32_t phy_addr_;
-};
-
-}  // namespace vta
-
-#endif  // VTA_DATA_BUFFER_H_
diff --git a/vta/src/tvm/vta_device_api.cc b/vta/src/device_api.cc
similarity index 75%
rename from vta/src/tvm/vta_device_api.cc
rename to vta/src/device_api.cc
index e4671d8a0207..687ec625a02c 100644
--- a/vta/src/tvm/vta_device_api.cc
+++ b/vta/src/device_api.cc
@@ -1,15 +1,14 @@
 /*!
  *  Copyright (c) 2018 by Contributors
- * \file vta_device_api.cc
- * \brief VTA device API for TVM
+ * \file device_api.cc
+ * \brief TVM device API for VTA
  */
 
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 #include <vta/runtime.h>
-#include <dlfcn.h>
 
-#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
+#include "../nnvm/tvm/src/runtime/workspace_pool.h"
 
 
 namespace tvm {
@@ -26,7 +25,8 @@ class VTADeviceAPI final : public DeviceAPI {
   }
 
   void* AllocDataSpace(TVMContext ctx,
-                       size_t size, size_t alignment,
+                       size_t size,
+                       size_t alignment,
                        TVMType type_hint) final {
     return VTABufferAlloc(size);
   }
@@ -84,22 +84,9 @@ void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {
   dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()->FreeWorkspace(ctx, data);
 }
 
-std::string VTARPCGetPath(const std::string& name) {
-  static const PackedFunc* f =
-      runtime::Registry::Get("tvm.contrib.rpc.server.workpath");
-  CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath";
-  return (*f)(name);
-}
-
-// Global functions that can be called
-TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    std::string path = VTARPCGetPath(args[0]);
-    VTAProgram(path.c_str());
-    LOG(INFO) << "VTA initialization end with bistream " << path;
-  });
-
-TVM_REGISTER_GLOBAL("device_api.ext_dev")
+// Register device api with override.
+static TVM_ATTRIBUTE_UNUSED auto& __register_dev__ =
+::tvm::runtime::Registry::Register("device_api.ext_dev", true)
 .set_body([](TVMArgs args, TVMRetValue* rv) {
     DeviceAPI* ptr = VTADeviceAPI::Global().get();
     *rv = static_cast<void*>(ptr);
diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc
index da5109c141ee..c0de87fa339d 100644
--- a/vta/src/runtime.cc
+++ b/vta/src/runtime.cc
@@ -5,8 +5,6 @@
  *
  *  The runtime depends on specific instruction
  *  stream spec as specified in hw_spec.h
- *  It is intended to be used as a dynamic library
- *  to enable hot swapping of hardware configurations.
  */
 #include <vta/driver.h>
 #include <vta/hw_spec.h>
@@ -14,15 +12,87 @@
 #include <dmlc/logging.h>
 
 #include <cassert>
+#include <cstring>
 #include <vector>
 #include <thread>
 #include <memory>
 #include <atomic>
 
-#include "./data_buffer.h"
 
 namespace vta {
 
+/*! \brief Enable coherent access between VTA and CPU. */
+static const bool kBufferCoherent = true;
+
+/*!
+ * \brief Data buffer represents data on CMA.
+ */
+struct DataBuffer {
+  /*! \return Virtual address of the data. */
+  void* virt_addr() const {
+    return data_;
+  }
+  /*! \return Physical address of the data. */
+  uint32_t phy_addr() const {
+    return phy_addr_;
+  }
+  /*!
+   * \brief Invalidate the cache of given location in data buffer.
+   * \param offset The offset to the data.
+   * \param size The size of the data.
+   */
+  void InvalidateCache(size_t offset, size_t size) {
+    if (!kBufferCoherent) {
+      VTAInvalidateCache(phy_addr_ + offset, size);
+    }
+  }
+  /*!
+   * \brief Flush the cache of certain location in data buffer.
+   * \param offset The offset to the data.
+   * \param size The size of the data.
+   */
+  void FlushCache(size_t offset, size_t size) {
+    if (!kBufferCoherent) {
+      VTAFlushCache(phy_addr_ + offset, size);
+    }
+  }
+  /*!
+   * \brief Allocate a buffer of a given size.
+   * \param size The size of the buffer.
+   */
+  static DataBuffer* Alloc(size_t size) {
+    void* data = VTAMemAlloc(size, 1);
+    CHECK(data != nullptr);
+    DataBuffer* buffer = new DataBuffer();
+    buffer->data_ = data;
+    buffer->phy_addr_ = VTAMemGetPhyAddr(data);
+    return buffer;
+  }
+  /*!
+   * \brief Free the data buffer.
+   * \param buffer The buffer to be freed.
+   */
+  static void Free(DataBuffer* buffer) {
+    VTAMemFree(buffer->data_);
+    delete buffer;
+  }
+  /*!
+   * \brief Create data buffer header from buffer ptr.
+   * \param buffer The buffer pointer.
+   * \return The corresponding data buffer header.
+   */
+  static DataBuffer* FromHandle(const void* buffer) {
+    return const_cast<DataBuffer*>(
+        reinterpret_cast<const DataBuffer*>(buffer));
+  }
+
+ private:
+  /*! \brief The internal data. */
+  void* data_;
+  /*! \brief The physical address of the buffer, excluding header. */
+  uint32_t phy_addr_;
+};
+
 /*!
  * \brief Micro op kernel.
  *  Contains functions to construct the kernel with prefix Push.
@@ -1130,6 +1200,42 @@ class CommandQueue {
 
 }  // namespace vta
 
+void* VTABufferAlloc(size_t size) {
+  return vta::DataBuffer::Alloc(size);
+}
+
+void VTABufferFree(void* buffer) {
+  vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));
+}
+
+void VTABufferCopy(const void* from,
+                   size_t from_offset,
+                   void* to,
+                   size_t to_offset,
+                   size_t size,
+                   int kind_mask) {
+  vta::DataBuffer* from_buffer = nullptr;
+  vta::DataBuffer* to_buffer = nullptr;
+
+  if (kind_mask & 2) {  // bit 1 set: `from` is a DataBuffer handle
+    from_buffer = vta::DataBuffer::FromHandle(from);
+    from = from_buffer->virt_addr();
+  }
+  if (kind_mask & 1) {  // bit 0 set: `to` is a DataBuffer handle
+    to_buffer = vta::DataBuffer::FromHandle(to);
+    to = to_buffer->virt_addr();
+  }
+  if (from_buffer) {  // invalidate before reading (no-op if kBufferCoherent)
+    from_buffer->InvalidateCache(from_offset, size);
+  }
+
+  memcpy(static_cast<char*>(to) + to_offset,
+         static_cast<const char*>(from) + from_offset,
+         size);
+  if (to_buffer) {  // flush after writing (no-op if kBufferCoherent)
+    to_buffer->FlushCache(to_offset, size);
+  }
+}
 
 VTACommandHandle VTATLSCommandHandle() {
   return vta::CommandQueue::ThreadLocal().get();
diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py
index 339d8d31e238..79a20be13de9 100644
--- a/vta/tests/python/unittest/test_vta_insn.py
+++ b/vta/tests/python/unittest/test_vta_insn.py
@@ -468,7 +468,22 @@ def _run(env, remote):
 
     vta.testing.run(_run)
 
+
+def test_runtime_array():
+    def _run(env, remote):
+        n = 100
+        ctx = remote.ext_dev(0)
+        x_np = np.random.randint(
+            1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype("int8")
+        x_nd = tvm.nd.array(x_np, ctx)
+        np.testing.assert_equal(x_np, x_nd.asnumpy())
+
+    vta.testing.run(_run)
+
+
 if __name__ == "__main__":
+    print("Array test")
+    test_runtime_array()
     print("Load/store test")
     test_save_load_out()
     print("Padded load test")