diff --git a/.gitmodules b/.gitmodules
index 78145973abd3..a1367c97b2f5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "3rdparty/rang"]
 	path = 3rdparty/rang
 	url = https://github.com/agauniyal/rang
+[submodule "3rdparty/vta-hw"]
+	path = 3rdparty/vta-hw
+	url = https://github.com/apache/incubator-tvm-vta
diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
new file mode 160000
index 000000000000..db65157208ec
--- /dev/null
+++ b/3rdparty/vta-hw
@@ -0,0 +1 @@
+Subproject commit db65157208ec8fabb7b548c94596211b9db04190
diff --git a/Makefile b/Makefile
index c1b565f25cdb..7bfe60b4e8a0 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ ifndef DLPACK_PATH
 endif
 
 ifndef VTA_HW_PATH
-  VTA_HW_PATH = $(ROOTDIR)/vta/vta-hw
+  VTA_HW_PATH = $(ROOTDIR)/3rdparty/vta-hw
 endif
 
 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index f11ae0c86f3a..4af39e088b23 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -20,7 +20,7 @@ find_program(PYTHON NAMES python python3 python3.6)
 
 # Throw error if VTA_HW_PATH is not set
 if(NOT DEFINED ENV{VTA_HW_PATH})
-  set(VTA_HW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/vta/vta-hw)
+  set(VTA_HW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw)
 else()
   set(VTA_HW_PATH $ENV{VTA_HW_PATH})
 endif()
diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst
index e9690cb3f43d..2f98d777608e 100644
--- a/docs/vta/dev/config.rst
+++ b/docs/vta/dev/config.rst
@@ -21,7 +21,7 @@ VTA Configuration
 The VTA stack incorporates both a hardware accelerator stack and
 a TVM based software stack.
 VTA incorporates flexibility out of the box: by modifying the
-``vta/vta-hw/config/vta_config.json`` high-level configuration file,
+``3rdparty/vta-hw/config/vta_config.json`` high-level configuration file,
 the user can change the shape of the tensor intrinsic,
 clock frequency, pipelining, data type width, and on-chip buffer sizes.
 
diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst
index 84cfc45bb6b2..6eb30407997f 100644
--- a/docs/vta/dev/hardware.rst
+++ b/docs/vta/dev/hardware.rst
@@ -53,17 +53,17 @@ HLS Hardware Source Organization
 
 The VTA design is currently specified in Vivado HLS C++, which is only supported
 by Xilinx toolchains.
-The VTA hardware sources are contained under ``vta/vta-hw/hardware/xilinx/sources``:
+The VTA hardware sources are contained under ``3rdparty/vta-hw/hardware/xilinx/sources``:
 
  - ``vta.cc`` contains the definitions for each VTA module, as well as a top
    level behavioral model for the top-level VTA design.
  - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and
    function prototypes declarations.
 
-In addition preprocessor macros are defined under ``vta/vta-hw/include/vta/hw_spec.h``.
+In addition preprocessor macros are defined under ``3rdparty/vta-hw/include/vta/hw_spec.h``.
 Much of these macro definitions are derived from the parameters listed in the
-``vta/vta-hw/config/vta_config.json`` file.
-The json file is processed by ``vta/vta-hw/config/vta_config.py`` to produce a string of
+``3rdparty/vta-hw/config/vta_config.json`` file.
+The json file is processed by ``3rdparty/vta-hw/config/vta_config.py`` to produce a string of
 compile flags that define the preprocessor macros.
 That string is used by the makefile in order to set those high-level
 parameters in both the HLS hardware synthesis compiler, and the C++
@@ -220,7 +220,7 @@ Microarchitectural Overview
 ---------------------------
 
 We describe the modules that compose the VTA design.
-The module definitions are contained in ``vta/vta-hw/hardware/xilinx/sources/vta.cc``.
+The module definitions are contained in ``3rdparty/vta-hw/hardware/xilinx/sources/vta.cc``.
 
 Fetch Module
 ~~~~~~~~~~~~
diff --git a/docs/vta/install.md b/docs/vta/install.md
index dd7ba9bb695c..a938a67218ff 100644
--- a/docs/vta/install.md
+++ b/docs/vta/install.md
@@ -32,7 +32,7 @@ For a quick and easy start, checkout the [Docker Guide](https://tvm.apache.org/d
 You'll need to set the following paths to use VTA:
 ```bash
 export TVM_PATH=<path to TVM root>
-export VTA_HW_PATH=$TVM_PATH/vta/vta-hw
+export VTA_HW_PATH=$TVM_PATH/3rdparty/vta-hw
 ```
 
 The VTA functional simulation library needs to be enabled when building TVM.
@@ -66,7 +66,7 @@ You are invited to try out our [VTA programming tutorials](https://tvm.apache.or
 ### Advanced Configuration (optional)
 
 VTA is a generic configurable deep learning accelerator.
-The configuration is specified by `vta_config.json` under `vta/vta-hw/config`.
+The configuration is specified by `vta_config.json` under `3rdparty/vta-hw/config`.
 This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack.
 
 The VTA configuration file also specifies the TVM compiler target.
@@ -76,7 +76,7 @@ To do so,
 
 ```bash
 cd <tvm root>
-vim vta/vta-hw/config/vta_config.json
+vim 3rdparty/vta-hw/config/vta_config.json
 # edit vta_config.json
 make
 ```
@@ -134,7 +134,7 @@ mkdir build
 cp cmake/config.cmake build/.
 echo 'set(USE_VTA_FPGA ON)' >> build/config.cmake
 # Copy pynq specific configuration
-cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
+cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json
 cd build
 cmake ..
 make runtime vta -j2
@@ -168,7 +168,7 @@ In addition, you'll need to edit the `vta_config.json` file on the host to indic
 ```bash
 # On the Host-side
 cd <tvm root>
-cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
+cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json
 ```
 
 This time again, we will run the 2D convolution testbench.
@@ -359,11 +359,11 @@ For this custom VTA bitstream compilation exercise, we'll change the frequency o
 * Set the `HW_FREQ` field to `142`. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it will be to close timing. Increasing the frequency can lead to timing violation and thus faulty hardware execution.
 * Set the `HW_CLK_TARGET` to `6`. This parameters refers to the target clock period in nano seconds for HLS - a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock would require a 7ns target, but we intentionally lower the clock target to 6ns to more aggressively pipeline our design.
 
-Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/vta-hw/hardware/xilinx/`.
+Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/3rdparty/vta-hw/hardware/xilinx/`.
 
 If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter:
 ```bash
-cd <tvm root>/vta/vta-hw/hardware/xilinx
+cd <tvm root>/3rdparty/vta-hw/hardware/xilinx
 make ip MODE=sim
 ```
 
@@ -371,7 +371,7 @@ If you just want to generate the HLS-based VTA IP cores without launching the en
 ```bash
 make ip
 ```
-You'll be able to view the HLS synthesis reports under `<tvm root>/vta/vta-hw/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
+You'll be able to view the HLS synthesis reports under `<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
 > Note: The `<configuration>` name is a string that summarizes the VTA configuration parameters listed in the `vta_config.json`. The `<block>` name refers to the specific module (or HLS function) that compose the high-level VTA pipeline.
 
 Finally to run the full hardware compilation and generate the VTA bitstream, run:
@@ -383,20 +383,20 @@ make
 This process is lengthy, and can take around up to an hour to complete depending on your machine's specs.
 We recommend setting the `VTA_HW_COMP_THREADS` variable in the Makefile to take full advantage of all the cores on your development machine.
 
-Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
 
 ### Chisel-based Custom VTA Bitstream Compilation for DE10-Nano
 
-Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
+Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/3rdparty/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
 
-For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvmroot>/vta/vta-hw/hardware/intel`.
+For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvm root>/3rdparty/vta-hw/hardware/intel`.
 
 If you just want to generate the Chisel-based VTA IP core for the DE10-Nano board without compiling the design for the FPGA hardware, enter:
 ```bash
-cd <tvmroot>/vta/vta-hw/hardware/intel
+cd <tvm root>/3rdparty/vta-hw/hardware/intel
 make ip
 ```
-Then you'll be able to locate the generated verilog file at `<tvmroot>/vta/vta-hw/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
+Then you'll be able to locate the generated verilog file at `<tvm root>/3rdparty/vta-hw/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
 
 If you would like to run the full hardware compilation for the `de10nano` board:
 ```bash
@@ -405,14 +405,14 @@ make
 
 This process might be a bit lengthy, and might take up to half an hour to complete depending on the performance of your PC. The Quartus Prime software would automatically detect the number of cores available on your PC and try to utilize all of them to perform such process.
 
-Once the compilation completes, the generated bistream can be found under `<tvmroot>vtay/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvmroot>/vta/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
 
 ### Use the Custom Bitstream
 
 We can program the new VTA FPGA bitstream by setting the bitstream path of the `vta.program_fpga()` function in the tutorial examples, or in the `test_program_rpc.py` script.
 
 ```python
-vta.program_fpga(remote, bitstream="<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
+vta.program_fpga(remote, bitstream="<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
 ```
 
 Instead of downloading a pre-built bitstream from the VTA bitstream repository, TVM will instead use the new bitstream you just generated, which is a VTA design clocked at a higher frequency.
diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh
index fbf3a63df0b2..d8e35ebd4de3 100755
--- a/tests/scripts/task_build.sh
+++ b/tests/scripts/task_build.sh
@@ -15,4 +15,5 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 cd $1 && cmake .. && make $2 && cd ..
diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh
index 69697caa2f3d..751e98e9abdc 100755
--- a/tests/scripts/task_cpp_unittest.sh
+++ b/tests/scripts/task_cpp_unittest.sh
@@ -20,6 +20,8 @@ set -e
 set -u
 
 export LD_LIBRARY_PATH="lib:${LD_LIBRARY_PATH:-}"
+# NOTE: it is important to use an absolute path for VTA_HW_PATH when VTA is enabled.
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # Remove existing testcases
 rm -f build/*_test
diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh
index 304d6230bcb8..f269866c39e7 100755
--- a/tests/scripts/task_python_vta_fsim.sh
+++ b/tests/scripts/task_python_vta_fsim.sh
@@ -21,7 +21,7 @@ set -u
 
 export TVM_PATH=`pwd`
 export PYTHONPATH=${TVM_PATH}/python:${TVM_PATH}/vta/python:${TVM_PATH}/topi/python
-export VTA_HW_PATH=`pwd`/vta/vta-hw
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh
index 65057cc68b9c..49366748b895 100755
--- a/tests/scripts/task_python_vta_tsim.sh
+++ b/tests/scripts/task_python_vta_tsim.sh
@@ -21,7 +21,7 @@ set -u
 
 export TVM_PATH=`pwd`
 export PYTHONPATH=${TVM_PATH}/python:${TVM_PATH}/vta/python:${TVM_PATH}/topi/python
-export VTA_HW_PATH=`pwd`/vta/vta-hw
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index c5b56e3756db..bbaac2ce1797 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -28,7 +28,7 @@
 def get_vta_hw_path():
     """Get the VTA HW path."""
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    vta_hw_default = os.path.abspath(os.path.join(curr_path, "../../vta-hw"))
+    vta_hw_default = os.path.abspath(os.path.join(curr_path, "../../../3rdparty/vta-hw"))
     VTA_HW_PATH = os.getenv('VTA_HW_PATH', vta_hw_default)
     return os.path.abspath(VTA_HW_PATH)
 
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index b0870b17f7a5..571dde669d2a 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -181,7 +181,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
 tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py
index 7ce38533e9d3..62fb32165a18 100644
--- a/vta/tutorials/frontend/deploy_classification.py
+++ b/vta/tutorials/frontend/deploy_classification.py
@@ -68,7 +68,7 @@
 # -------------------------------------
 # Execute on CPU vs. VTA, and define the model.
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # Set ``device=arm_cpu`` to run inference on the CPU
diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py
index 83fa8fb6d7ee..efcd2c43591d 100644
--- a/vta/tutorials/frontend/deploy_detection.py
+++ b/vta/tutorials/frontend/deploy_detection.py
@@ -111,7 +111,7 @@
 # --------------------------------------
 # Execute on CPU vs. VTA, and define the model.
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # Set ``device=arm_cpu`` to run inference on the CPU
 # or ``device=vta`` to run inference on the FPGA.
diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py
index 227144ec1709..024e1792e9d0 100644
--- a/vta/tutorials/matrix_multiply.py
+++ b/vta/tutorials/matrix_multiply.py
@@ -43,7 +43,7 @@
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py
index f609a7200843..0564a6ace179 100644
--- a/vta/tutorials/optimize/convolution_opt.py
+++ b/vta/tutorials/optimize/convolution_opt.py
@@ -47,7 +47,7 @@
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py
index da3b9bbc5fc0..77b038176b6b 100644
--- a/vta/tutorials/optimize/matrix_multiply_opt.py
+++ b/vta/tutorials/optimize/matrix_multiply_opt.py
@@ -46,7 +46,7 @@
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/vta-hw/apps/gemm/CMakeLists.txt b/vta/vta-hw/apps/gemm/CMakeLists.txt
deleted file mode 100644
index f41a46706bdb..000000000000
--- a/vta/vta-hw/apps/gemm/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-cmake_minimum_required(VERSION 3.2)
-project(tsim C CXX)
-
-if(NOT DEFINED ENV{TVM_PATH})
-    message(ERROR "Make sure to set TVM_PATH in your environment")
-endif()
-
-if(NOT DEFINED ENV{VTA_HW_PATH})
-    message(ERROR "Make sure to set VTA_HW_PATH in your environment")
-endif()
-
-include_directories("$ENV{TVM_PATH}/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dlpack/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dmlc-core/include")
-include_directories("$ENV{VTA_HW_PATH}/src/dpi")
-
-set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
-set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
-
-if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
-    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
-endif()
-
-file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/dpi/module.cc)
-
-add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE $ENV{VTA_HW_PATH}/include $ENV{VTA_HW_PATH}/src)
-
-if(APPLE)
-  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
-endif(APPLE)
diff --git a/vta/vta-hw/apps/gemm/Makefile b/vta/vta-hw/apps/gemm/Makefile
deleted file mode 100644
index 6bdebea69777..000000000000
--- a/vta/vta-hw/apps/gemm/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-export PYTHONPATH:=$(abspath .)/python:$(PYTHONPATH)
-
-BUILD_NAME = build
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-default: chisel driver serial parallel
-
-serial:
-	python3 tests/python/chisel_accel.py serial
-
-parallel:
-	python3 tests/python/chisel_accel.py parallel
-
-driver: | $(build_dir)
-	cd $(build_dir) && cmake .. && make
-
-$(build_dir):
-	mkdir -p $@
-
-chisel:
-	make -C hardware/chisel
-
-clean:
-	-rm -rf $(build_dir)
-	make -C hardware/chisel clean
diff --git a/vta/vta-hw/apps/gemm/README.md b/vta/vta-hw/apps/gemm/README.md
deleted file mode 100644
index bf7e1c1ade1e..000000000000
--- a/vta/vta-hw/apps/gemm/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA TSIM Application
-======================
-Prior to this application, please take a look at `<vta-hw-root>/apps/tsim_example` for installation
-This is an application that performs Bit Serial Multiplication for GEMM utilizing TSIM.
-
-**Bit Serial Multiplication for GEMM:**
-
-General Matrix Multiplications (GEMM), are mostly calculated by repeatly calculating the dot product for each pair of vectors.
-The dot product is calculated by summing every product of the vector pair.
-We approach this operation with slicing and shifting, like how basic multiplication works, each vector elements before we accumulate them.
-We can sufficiently reduce the cycles required to perform a gemm given that the data bit width is small. This GEMM application uses TSIM for future accerlerator prototypes.
-
-* Test Chisel3 backend with bit serial GEMM
-    * Go to `<vta-hw-root>/apps/gemm`
-    * Run `make`
-
-* If you have already compiled chisel backend (i.e. ran `make`)
-    * Bit Serial test with another input set, run `make serial`
-    * Bit parallel test with another input set, run `make parallel`
-
-* Some steps for creating your own custom TSIM application
-    * Go to `<vta-hw-root>/apps/gemm`
-    * Create custom circuit within `./hardware/chisel/src/scala.main/accel/Compute.scala`
-    * Map the according Registers in `./hardware/chisel/src/scala.main/accel/RegFile.scala`
-    * Create your test script
-    * Map the registers in `./src/driver.cc` and link it with both `RegFile.scala` and the test script
-    * Understanding of `<vta-hw-root>/apps/tsim_example`, which performs add by one to a vector, is highly encouraged to create a more complex application
-
-* Some pointers
-    * Chisel3 tests in `<vta-hw-root>/apps/gemm/tests/python`
-    * Chisel3 accelerator backend `<vta-hw-root>/apps/gemm/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<vta-hw-root>/apps/gemm/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<vta-hw-root>/apps/gemm/python/accel`
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/Makefile b/vta/vta-hw/apps/gemm/hardware/chisel/Makefile
deleted file mode 100644
index 310f62335043..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/Makefile
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed on a different location
-ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
-    else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
-  else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
-endif
-
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 1
-LIBNAME = libhw
-
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-verilator_build_dir = $(build_dir)/verilator
-chisel_build_dir = $(build_dir)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-cxx_flags = -O2 -Wall -fPIC -shared
-cxx_flags += -fvisibility=hidden -std=c++11
-cxx_flags += -DVL_TSIM_NAME=V$(TOP)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(vta_dir)/include
-cxx_flags += -I$(tvm_dir)/include
-cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
-
-cxx_files = $(VERILATOR_INC_DIR)/verilated.cpp
-cxx_files += $(VERILATOR_INC_DIR)/verilated_dpi.cpp
-cxx_files += $(wildcard $(verilator_build_dir)/*.cpp)
-cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc
-
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_path = $(build_dir)/$(LIBNAME).dylib
-else
-  lib_path = $(build_dir)/$(LIBNAME).so
-endif
-
-default: lib
-
-lib: $(lib_path)
-$(lib_path): $(verilator_build_dir)/V$(TOP).cpp
-	g++ $(cxx_flags) $(cxx_files) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP).cpp
-$(verilator_build_dir)/V$(TOP).cpp: $(chisel_build_dir)/$(TOP).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).v
-$(chisel_build_dir)/$(TOP).v: install_vta_package
-	sbt 'test:runMain test.Elaborate --target-dir $(chisel_build_dir) --top-name $(TOP)'
-
-install_vta_package:
-	cd $(vta_dir)/hardware/chisel && sbt publishLocal
-
-clean:
-	-rm -rf $(build_dir) target project/target project/project
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt b/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt
deleted file mode 100644
index a2afc0d9d362..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "accel"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-libraryDependencies ++= Seq(
-  "edu.berkeley.cs" %% "chisel3" % "3.1.7",
-  "edu.washington.cs" %% "vta" % "0.1.0-SNAPSHOT",
-)
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties b/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998eb3eac..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt b/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 79ffb2245d52..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
deleted file mode 100644
index add07c320c1e..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import vta.dpi._
-
-/** Add-by-one accelerator.
-  *
-  * ___________      ___________
-  * |         |      |         |
-  * | HostDPI | <--> | RegFile | <->|
-  * |_________|      |_________|    |
-  *                                 |
-  * ___________      ___________    |
-  * |         |      |         |    |
-  * | MemDPI  | <--> | Compute | <->|
-  * |_________|      |_________|
-  *
-  */
-case class AccelConfig() {
-  val nCtrl = 1
-  val nECnt = 1
-  val nVals = 4
-  val nPtrs = 3
-  val regBits = 32
-  val ptrBits = 2*regBits
-}
-
-class Accel extends Module {
-  val io = IO(new Bundle {
-    val host = new VTAHostDPIClient
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val config = AccelConfig()
-  val rf = Module(new RegFile)
-  val ce = Module(new Compute)
-  rf.io.host <> io.host
-  io.mem <> ce.io.mem
-  ce.io.launch := rf.io.launch
-  rf.io.finish := ce.io.finish
-  rf.io.ecnt <> ce.io.ecnt
-  ce.io.vals <> rf.io.vals
-  ce.io.ptrs <> rf.io.ptrs
-}
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
deleted file mode 100644
index 1eced6ecc3c2..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-import vta.core._
-import vta.util.config._
-import vta.shell._
-
-class TestConfig extends Config(new CoreConfig ++ new PynqConfig)
-/** Compute
-  *
-  * Bit Slice GEMM:
-  *
-  * 1. Wait for launch to be asserted
-  * 2. Issue 1 read request for 8-bit value at inp1_baddr address (read matrix)
-  * 3. Wait for the value
-  * 4. Increment read-address for next value
-  * 5. Repeat until all inp1 data have been read
-
-  * 6. Issue 1 read request for 8-bit value at inp2_baddr address (read vector)
-  * 7. Wait for the value
-  * 8. Increment read-address for next value
-  * 9. Repeat until all inp2 data have been read
-
-  * 10. Wait for output to be calculated
-  * 11. Issue a write request for 8-byte value at out_baddr address
-  * 12. Increment write-address for next value to write
-  * 13. Check if counter (cntout) is equal to length to asser finish,
-       otherwise go to step 11
-  */
-class Compute(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Output(Bool())
-    val ecnt = Vec(config.nECnt, ValidIO(UInt(config.regBits.W)))
-    val vals = Input(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Input(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val p: Parameters = new TestConfig
-  val sIdle :: sReadAReq :: sReadAData :: sReadADone ::sReadBReq :: sReadBData :: sReadBDone :: sInpDone ::sWait:: sWriteReq :: sWriteData :: sWriteDone :: Nil = Enum(12)
-  val state = RegInit(sIdle)
-  val shift = io.vals(0)
-  val length = io.vals(1)
-  val rstAccum = io.vals(2)
-  val startDot = io.vals(3)
-  val cycles = RegInit(0.U(config.regBits.W))
-  val mvc = Module(new MatrixVectorMultiplication)
-  val reg1 = Reg(chiselTypeOf(mvc.io.wgt.data.bits))
-  val reg2 = Reg(chiselTypeOf(mvc.io.inp.data.bits))
-  val cntwgt = Reg(UInt(config.regBits.W))
-  val cntinp = Reg(UInt(config.regBits.W))
-  val cntout = Reg(UInt(config.regBits.W))
-  val raddr1 = Reg(UInt(config.ptrBits.W))
-  val raddr2 = Reg(UInt(config.ptrBits.W))
-  val waddr = Reg(UInt(config.ptrBits.W))
-  val accum = Module(new Accmulator(size = p(CoreKey).blockOut, accBits = p(CoreKey).accBits))
-
-  switch (state) {
-    is (sIdle) {
-      when (io.launch) {
-        state := sReadAReq
-      }
-    }
-    // Read
-    is (sReadAReq) {
-      state := sReadAData
-    }
-    is (sReadAData) {
-      when (io.mem.rd.valid) {
-        state := sReadADone
-      }
-    }
-    is (sReadADone) {
-      when (cntwgt === (length * length) - 1.U) {
-        state := sReadBReq
-      } .otherwise {
-        state := sReadAReq
-      }
-    }
-    is (sReadBReq) {
-      state := sReadBData
-    }
-    is (sReadBData) {
-      when (io.mem.rd.valid) {
-        state := sReadBDone
-      }
-    }
-    is (sReadBDone) {
-      when (cntinp === length-1.U) {
-        state := sInpDone
-      } .otherwise {
-        state := sReadBReq
-      }
-    }
-    // Both input is processed
-    is (sInpDone) {
-      state := sWait
-    }
-    // Wait for computation
-    is (sWait) {
-      when (accum.io.ready) {
-        state := sWriteReq
-      }
-    }
-    // Write
-    is (sWriteReq) {
-      state := sWriteData
-    }
-    is (sWriteData) {
-        state := sWriteDone
-    }
-    is (sWriteDone) {
-      when (cntout === (length - 1.U)) {
-        state := sIdle
-      } .otherwise {
-        state := sWriteReq
-      }
-    }
-  }
-
-  val last = state === sWriteDone && cntout === (length - 1.U)
-
-  // cycle counter
-  when (state === sIdle) {
-    cycles := 0.U
-  } .otherwise {
-    cycles := cycles + 1.U
-  }
-
-  io.ecnt(0).valid := last
-  io.ecnt(0).bits := cycles
-
-  // calculate next address
-  when (state === sIdle) {
-    raddr1 := io.ptrs(0)
-    raddr2 := io.ptrs(1)
-    waddr := io.ptrs(2)
-  } .elsewhen (state === sReadADone) { // increment input array by 1-byte
-    raddr1 := raddr1 + 1.U
-  } .elsewhen (state === sReadBDone) { // increment input array by 1-byte
-    raddr2 := raddr2 + 1.U
-  } .elsewhen (state === sWriteDone) {
-    waddr := waddr + 4.U // writing 4 bytes
-  }
-
-  // create request
-  io.mem.req.valid := state === sReadAReq | state === sReadBReq | state === sWriteReq
-  io.mem.req.opcode := state === sWriteReq
-  io.mem.req.len := 0.U // one-word-per-request
-  io.mem.req.addr := Mux(state === sReadAReq | state === sReadBReq, Mux(state === sReadAReq, raddr1, raddr2), waddr)
-
-  // read
-  when (state === sReadAData && io.mem.rd.valid) {
-    reg1(cntwgt/length)(cntwgt%length) := io.mem.rd.bits(7, 0)
-  }
-
-  when (state === sReadBData && io.mem.rd.valid) {
-    reg2(0)(cntinp) := io.mem.rd.bits(7, 0)
-  }
-
-  io.mem.rd.ready := state === sReadAData | state === sReadBData
-  mvc.io.inp.data.valid := state === sInpDone // 2 inputs have been processed
-  mvc.io.wgt.data.valid := state === sInpDone // 2 inputs have been processed
-
-  mvc.io.wgt.data.bits <> reg1
-  mvc.io.inp.data.bits <> reg2
-  // Modify when shift operation is supported
-  mvc.io.reset := false.B
-  mvc.io.acc_i.data.valid := true.B
-  for (i <- 0 until p(CoreKey).blockOut) {
-    mvc.io.acc_i.data.bits(0)(i) := 0.U
-  }
-
-  accum.io.in := mvc.io.acc_o.data.bits
-  accum.io.shift := shift
-  accum.io.clear := rstAccum
-  accum.io.valid := mvc.io.acc_o.data.valid
-
-  // write
-  io.mem.wr.valid := state === sWriteData
-  io.mem.wr.bits := accum.io.sum(cntout)
-
-  // count read/write
-  when (state === sIdle) {
-    cntwgt := 0.U
-    cntinp := 0.U
-    cntout := 0.U
-  } .elsewhen (state === sReadADone) {
-    cntwgt := cntwgt + 1.U
-  } .elsewhen (state === sReadBDone) {
-    cntinp := cntinp + 1.U
-  } .elsewhen (state === sWriteDone) {
-    cntout := cntout + 1.U
-  }
-
-  io.finish := last // data has been added
-}
-// Shift operation until supported in MVM
-class Accmulator(size: Int = 16, accBits: Int = 32) extends Module {
-  val io = IO(new Bundle {
-    val clear = Input(Bool())
-    val valid = Input(Bool())
-    val ready = Output(Bool())
-    val in = Input(Vec(1, Vec(size, (UInt(accBits.W)))))
-    val shift = Input(UInt(8.W))
-    val sum = Output(Vec(size, (UInt(accBits.W))))
-  })
-    val reg = RegInit(VecInit(Seq.fill(size)(0.U(accBits.W))))
-
-    for (i <- 0 until size) {
-      when (io.clear) {
-        reg(i) := 0.U
-      } .elsewhen(io.valid) {
-        reg(i) := reg(i) + (io.in(0)(i) << io.shift)
-      }
-    }
-    io.ready := RegNext(io.valid)
-    io.sum := reg
-}
-
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
deleted file mode 100644
index 10c40b5c2e72..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Register File.
-  *
-  * Six 32-bit register file.
-  *
-  * -------------------------------
-  *  Register description    | addr
-  * -------------------------|-----
-  *  Control status register | 0x00
-  *  Cycle counter           | 0x04
-  *  Shift value             | 0x08
-  *  Vector length           | 0x0c
-  *  Reset Accumulator       | 0x10
-  *  Input1 pointer          | 0x18
-  *  Input2 pointer          | 0x20
-  *  Output pointer          | 0x28
-  * -------------------------------
-
-  * ------------------------------
-  *  Control status register | bit
-  * ------------------------------
-  *  Launch                  | 0
-  *  Finish                  | 1
-  * ------------------------------
-  */
-class RegFile(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Output(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(config.nECnt, Flipped(ValidIO(UInt(config.regBits.W))))
-    val vals = Output(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Output(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val host = new VTAHostDPIClient
-  })
-  val sIdle :: sRead :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch (state) {
-    is (sIdle) {
-      when (io.host.req.valid && !io.host.req.opcode) {
-        state := sRead
-      }
-    }
-    is (sRead) {
-      state := sIdle
-    }
-  }
-
-  io.host.req.deq := state === sIdle & io.host.req.valid
-
-  val nTotal = config.nCtrl + config.nECnt + config.nVals + (2*config.nPtrs)
-  val reg = Seq.fill(nTotal)(RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value))))
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg)  map { case (a, r) => a.U -> r }
-  val eo = config.nCtrl
-  val vo = eo + config.nECnt
-  val po = vo + config.nVals
-
-  when (io.finish) {
-    reg(0) := "b_10".U
-  } .elsewhen (state === sIdle && io.host.req.valid &&
-        io.host.req.opcode && addr(0).U === io.host.req.addr) {
-    reg(0) := io.host.req.value
-  }
-
-  for (i <- 0 until config.nECnt) {
-    when (io.ecnt(i).valid) {
-      reg(eo + i) := io.ecnt(i).bits
-    } .elsewhen (state === sIdle && io.host.req.valid &&
-          io.host.req.opcode && addr(eo + i).U === io.host.req.addr) {
-      reg(eo + i) := io.host.req.value
-    }
-  }
-
-  for (i <- 0 until (config.nVals + (2*config.nPtrs))) {
-    when (state === sIdle && io.host.req.valid &&
-          io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
-      reg(vo + i) := io.host.req.value
-    }
-  }
-
-  val rdata = RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value)))
-  when (state === sIdle && io.host.req.valid && !io.host.req.opcode) {
-    rdata := MuxLookup(io.host.req.addr, 0.U, reg_map)
-  }
-
-  io.host.resp.valid := state === sRead
-  io.host.resp.bits := rdata
-
-  io.launch := reg(0)(0)
-
-  for (i <- 0 until config.nVals) {
-    io.vals(i) := reg(vo + i)
-  }
-
-  for (i <- 0 until config.nPtrs) {
-    io.ptrs(i) := Cat(reg(po + 2*i + 1), reg(po + 2*i))
-  }
-}
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
deleted file mode 100644
index d931620ec67d..000000000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package test
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.dpi._
-import accel._
-
-/** VTA simulation shell.
-  *
-  * Instantiate Host and Memory DPI modules.
-  *
-  */
-class VTASimShell extends MultiIOModule {
-  val host = IO(new VTAHostDPIMaster)
-  val mem = IO(new VTAMemDPIClient)
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASimDPI)
-  val mod_host = Module(new VTAHostDPI)
-  val mod_mem = Module(new VTAMemDPI)
-  mod_mem.io.clock := clock
-  mod_mem.io.reset := reset
-  mod_mem.io.dpi <> mem
-  mod_host.io.clock := clock
-  mod_host.io.reset := reset
-  host <> mod_host.io.dpi
-  mod_sim.io.clock := sim_clock
-  mod_sim.io.reset := reset
-  sim_wait := mod_sim.io.dpi_wait
-}
-
-/** Test accelerator.
-  *
-  * Instantiate and connect the simulation-shell and the accelerator.
-  *
-  */
-class TestAccel extends MultiIOModule {
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val sim_shell = Module(new VTASimShell)
-  val vta_accel = Module(new Accel)
-  sim_shell.sim_clock := sim_clock
-  sim_wait := sim_shell.sim_wait
-  sim_shell.mem <> vta_accel.io.mem
-  vta_accel.io.host <> sim_shell.host
-}
-
-/** Generate TestAccel as top module */
-object Elaborate extends App {
-  chisel3.Driver.execute(args, () => new TestAccel)
-}
diff --git a/vta/vta-hw/apps/gemm/python/__init__.py b/vta/vta-hw/apps/gemm/python/__init__.py
deleted file mode 100644
index 4bc21e287d69..000000000000
--- a/vta/vta-hw/apps/gemm/python/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from . import tsim
diff --git a/vta/vta-hw/apps/gemm/python/tsim.py b/vta/vta-hw/apps/gemm/python/tsim.py
deleted file mode 100644
index 85fd463e3278..000000000000
--- a/vta/vta-hw/apps/gemm/python/tsim.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import ctypes
-import os.path as osp
-from sys import platform
-
-def get_ext():
-    """Return shared library extension"""
-    return ".dylib" if platform == "darwin" else ".so"
-
-def load_dll(dll):
-    """Load shared library
-
-    Parameters
-    ------------
-    dll : str
-        Path for shared library
-
-    Returns
-    ------------
-    The shared library
-    """
-    try:
-        return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
-    except OSError:
-        return []
-
-def load_sw():
-    """Load all software shared libraries"""
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    sw_libname = "libsw" + get_ext()
-    sw_lib = osp.join(cur_path, "..", "build", sw_libname)
-    load_dll(sw_lib)
-
-def init(hw_backend):
-    """Init hardware and software shared library for accelerator
-
-    Parameters
-    ------------
-    hw_backend : str
-        Hardware backend can be verilog or chisel
-
-    """
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    hw_libname = "libhw" + get_ext()
-    if hw_backend in ("verilog", "chisel"):
-        hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
-    load_sw()
-    m = tvm.runtime.load_module(hw_lib, "vta-tsim")
-    f = tvm.get_global_func("tvm.vta.tsim.init")
-    f(m)
-
-def load_module():
-    """Return driver function"""
-    load_sw()
-    return tvm.get_global_func("tvm.vta.driver")
diff --git a/vta/vta-hw/apps/gemm/src/driver.cc b/vta/vta-hw/apps/gemm/src/driver.cc
deleted file mode 100644
index 24b998edd211..000000000000
--- a/vta/vta-hw/apps/gemm/src/driver.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/registry.h>
-#include <vta/dpi/module.h>
-
-#include "vmem/virtual_memory.h"
-
-namespace vta {
-namespace driver {
-
-using vta::dpi::DPIModuleNode;
-using tvm::runtime::Module;
-
-class DPILoader {
- public:
-  ~DPILoader() {
-    dpi_->SimResume();
-    dpi_->SimFinish();
-  }
-
-  void Init(Module module) {
-    mod_ = module;
-    dpi_ = this->Get();
-    dpi_->SimLaunch();
-    dpi_->SimWait();
-  }
-
-  DPIModuleNode* Get() {
-    return static_cast<DPIModuleNode*>(mod_.operator->());
-  }
-
-  static DPILoader* Global() {
-    static DPILoader inst;
-    return &inst;
-  }
-
-  // TVM module
-  Module mod_;
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-};
-
-class Device {
- public:
-  Device() {
-    loader_ = DPILoader::Global();
-  }
-
-  uint32_t Run(DLTensor* inp1, DLTensor* inp2, uint32_t shiftVal, DLTensor* out, uint32_t reset) {
-    uint32_t cycles;
-    uint32_t length = inp2->shape[0];
-    // 1 matrix 1 vector input
-    size_t size1 = (inp1->dtype.bits >> 3) * length * length;
-    size_t size2 = (inp2->dtype.bits >> 3) * length;
-    // 1 vector output
-    size_t size3 = (32 >> 3) * length;
-    inp1_ = this->MemAlloc(size1);
-    inp2_ = this->MemAlloc(size2);
-    out_ = this->MemAlloc(size3);
-    this->MemCopyFromHost(inp1_, inp1->data, size1);
-    this->MemCopyFromHost(inp2_, inp2->data, size2);
-    this->Init();
-    this->Launch(length, shiftVal, reset);
-    cycles = this->WaitForCompletion();
-    this->MemCopyToHost(out->data, out_, size3);
-    this->MemFree(inp1_);
-    this->MemFree(inp2_);
-    this->MemFree(out_);
-    return cycles;
-  }
-
- private:
-  void Init() {
-    dpi_ = loader_->Get();
-    dpi_->SimResume();
-  }
-
-  void* MemAlloc(size_t size) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
-    return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
-  }
-
-  void MemFree(void* buf) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
-    vta::vmem::VirtualMemoryManager::Global()->Free(addr);
-  }
-
-  vta_phy_addr_t MemGetPhyAddr(void* buf) {
-    return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
-  }
-
-  void MemCopyFromHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
-  }
-
-  void MemCopyToHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
-  }
-
-  void Launch(uint32_t length, uint32_t shiftVal, uint32_t reset) {
-    dpi_->WriteReg(0x08, shiftVal);
-    dpi_->WriteReg(0x0c, length); // tensor size
-    dpi_->WriteReg(0x18, this->MemGetPhyAddr(inp1_));
-    dpi_->WriteReg(0x20, this->MemGetPhyAddr(inp2_));
-    dpi_->WriteReg(0x28, this->MemGetPhyAddr(out_));
-    dpi_->WriteReg(0x00, 0x1); // launch
-    dpi_->WriteReg(0x00, 0x0); 
-
-    if (reset == 1) {
-      dpi_->WriteReg(0x10, 0x1); // reset accumulator
-      dpi_->WriteReg(0x10, 0x0); 
-    }
-  }
-
-  uint32_t WaitForCompletion() {
-    uint32_t i, val;
-    for (i = 0; i < wait_cycles_; i++) {
-      val = dpi_->ReadReg(0x00);
-      if (val == 2) break; // finish
-    }
-    val = dpi_->ReadReg(0x04);
-    dpi_->SimWait();
-    return val;
-  }
-
-  // wait cycles
-  uint32_t wait_cycles_{100000000};
-  // DPI loader
-  DPILoader* loader_{nullptr};
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-  // input vm ptr
-  void* inp1_{nullptr};
-  void* inp2_{nullptr};
-  // output vm ptr
-  void* out_{nullptr};
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Module m = args[0];
-    DPILoader::Global()->Init(m);
-  });
-
-TVM_REGISTER_GLOBAL("tvm.vta.driver")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[3];
-    Device dev_;
-    uint32_t cycles = dev_.Run(A, B, static_cast<int>(args[2]), C, static_cast<int>(args[4]));
-    *rv = static_cast<int>(cycles);
-  });
-
-}  // namespace driver
-}  // namespace vta
diff --git a/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py b/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py
deleted file mode 100644
index 441f36d8de09..000000000000
--- a/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-import sys
-
-""" Vector Bit Slice and Pack Function
-Parameters
-----------
-A : Vector to be sliced and packed
-slice_width : slice width
-
-Returns
----------
-C: 2d matrix where each cloumn (because of bit packing) represents each bit slice of A
-"""
-def slice(A, slice_width):
-    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
-    dtype = type(A[0])
-    row = 0
-    # currently only supports uint
-    if dtype is np.uint8: row = 8 // slice_width
-    elif dtype is np.uint16: row = 16 // slice_width
-    elif dtype is np.uint32: row = 32 // slice_width
-    elif dtype is np.uint64: row = 64 // slice_width
-    else: raise ValueError("datatype currently not supported")
-    if (row >= 8):
-        dtype = 'uint' + str(row)
-    else:
-        dtype = 'uint8'
-
-    C = np.zeros((row, len(A))).astype(dtype) # sliced and transform
-
-    # create mask
-    slice_mask = 2**(slice_width)-1
-    # slice and pack
-    for x in range(len(A)):
-        for y in range(row):
-            C[y][x] = (np.uint64(A[x]) >> np.uint64(slice_width * y)) & np.uint64(slice_mask)
-    return C
-
-def slice_mat(A, slice_width):
-    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
-    dtype = type(A[0][0])
-    row = 0
-    # currently only supports uint
-    if dtype is np.uint8: row = 8 // slice_width
-    elif dtype is np.uint16: row = 16 // slice_width
-    elif dtype is np.uint32: row = 32 // slice_width
-    elif dtype is np.uint64: row = 64 // slice_width
-    else: raise ValueError("datatype currently not supported")
-    if (row >= 8):
-        dtype = 'uint' + str(row)
-    else:
-        dtype = 'uint8'
-
-    # 3d array (bits, row, clmn)
-    C = np.zeros((row, A.shape[0], A.shape[1])).astype(dtype) # sliced and transform
-
-    # create mask
-    slice_mask = 2**(slice_width)-1
-    # slice and pack
-    for z in range(A.shape[0]):
-        C[:, z, :] = slice(A[z], slice_width)
-    return C
-
-""" Matrix Multiplication Function
-Parameters
-----------
-A : Matrix A
-B: Matrix B
-i_width : weight slice width
-w_width : activation slice width
-
-Returns
----------
-C: result of A * B
-"""
-# A is a n*m matrix, B is a m*p matrix(not transposed yet)
-def matrix_multiply(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[0], "can't perform multiplication"
-    BT = B.transpose()
-    cycles = 0
-    B_sliced = slice_mat(BT, w_width)
-    C = np.zeros((A.shape[0], B.shape[1])).astype('uint64')
-    for i in range(A.shape[0]):
-        A_sliced = slice(A[i], i_width)
-        test = test_accel(A_sliced, B_sliced, i_width, w_width)
-        C[i] = test[0]
-        cycles += test[1]
-        np.testing.assert_array_equal(C[i], compute(A_sliced, B_sliced, i_width, w_width))
-        print("PASS row " + str(i))
-
-    np.testing.assert_array_equal(C, np.matmul(A.astype('uint64'),B))
-    print("result: ")
-    print(C)
-    print("TEST PASSED, cycles: " + str(cycles))
-    return C
-
-""" Software Verification Function
-Parameter Dimesions
----------
-A (bits, y) and B (bits, y, x) (transposed)
-
-Takes 1 vector and 1 matrix input (sliced and packed)
-
-Returns
----------
-Resulting vector
-"""
-def compute(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[1], "sliced shape not match"
-    # reset hardware accumulator
-    accum = np.zeros(A.shape[1])
-    for x in range(A.shape[0]):
-        for y in range(B.shape[0]):
-            accum += np.matmul(A[x].astype('uint64'), B[y].transpose()) << np.uint64(x*i_width + y*w_width)
-    # get value from accumulator
-    return accum
-
-"""Testing Function for Matrix Vector Multiplication"""
-def test_accel(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[2], "sliced shape not match"
-    dtype = A.dtype
-    ctx = tvm.cpu(0)
-    f = tsim.load_module()
-
-    a_arr = []
-    b_arr = []
-    for i in range(A.shape[0]):
-        list_a = np.zeros(A.shape[1]).astype(dtype)
-        for j in range(A.shape[1]):
-            list_a[j] = A[i][j]
-        a_arr.append(tvm.nd.array(list_a.astype(dtype), ctx))
-
-    for i in range(B.shape[0]):
-        # transpose
-        list_b = np.zeros((B.shape[2], B.shape[1])).astype(dtype)
-        for j in range(B.shape[2]):
-            for k in range(B.shape[1]):
-                list_b[j][k] = B[i][j][k]
-        b_arr.append(tvm.nd.array(list_b.astype(dtype), ctx))
-
-    cycles = 0
-    accum = tvm.nd.array(np.zeros(A.shape[1]).astype("uint32"), ctx)
-    for i in range(len(a_arr)):
-        for j in range(len(b_arr)):
-            shift = np.uint8(i*i_width + j*w_width)
-            if i == 0 and j == 0:
-                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(1)) # reset accumulator
-            else:
-                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(0)) # no reset
-
-    return (accum.asnumpy(), cycles)
-
-""" Matrix Generator
-Parameters
-----------
-dtype : String, datatype generated (supports only uint)
-i_width : weight bit slices(needs to be less than actual bit width)
-w_width : activation bit slices(needs to be less than actual bit width)
-"""
-def top_test(dtype, i_width, w_width):
-
-    # only supports positive values (up to 2**(bits-1))
-    rmax = 127
-    # (m,16) * (16,16) GEMM
-    rrow = np.random.randint(7) + 1
-    clmn = 16
-    A = np.random.randint(rmax, size=(rrow,clmn)).astype(dtype)
-    B = np.random.randint(rmax, size=(clmn,clmn)).astype(dtype)
-
-    print("A: " + str(A))
-    print("B: " + str(B))
-    # perform GEMM
-    matrix_multiply(A, B, i_width, w_width)
-
-if __name__ == "__main__":
-    tsim.init("chisel")
-    for i in range(1):
-        # reg1 and reg2 bits in hardware/chisel/src/main/Compute.scala must be modified for slices greater than 8 bits
-        if sys.argv[1] == 'serial':
-          # generates a random uint8 GEMM with 2-bit(8/4) input and 4-bit(8/2) weight
-          top_test("uint8", 4, 2)
-        elif sys.argv[1] == 'parallel':
-          # generates a random uint8 GEMM with 8-bit input and 8-bit weight (bit parallel)
-          top_test('uint8', 8, 8)
diff --git a/vta/vta-hw/apps/tsim_example/CMakeLists.txt b/vta/vta-hw/apps/tsim_example/CMakeLists.txt
deleted file mode 100644
index f41a46706bdb..000000000000
--- a/vta/vta-hw/apps/tsim_example/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-cmake_minimum_required(VERSION 3.2)
-project(tsim C CXX)
-
-if(NOT DEFINED ENV{TVM_PATH})
-    message(ERROR "Make sure to set TVM_PATH in your environment")
-endif()
-
-if(NOT DEFINED ENV{VTA_HW_PATH})
-    message(ERROR "Make sure to set VTA_HW_PATH in your environment")
-endif()
-
-include_directories("$ENV{TVM_PATH}/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dlpack/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dmlc-core/include")
-include_directories("$ENV{VTA_HW_PATH}/src/dpi")
-
-set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
-set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
-
-if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
-    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
-endif()
-
-file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/dpi/module.cc)
-
-add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE $ENV{VTA_HW_PATH}/include $ENV{VTA_HW_PATH}/src)
-
-if(APPLE)
-  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
-endif(APPLE)
diff --git a/vta/vta-hw/apps/tsim_example/Makefile b/vta/vta-hw/apps/tsim_example/Makefile
deleted file mode 100644
index 406f9314ab03..000000000000
--- a/vta/vta-hw/apps/tsim_example/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-export PYTHONPATH:=$(PWD)/python:$(PYTHONPATH)
-
-BUILD_NAME = build
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-default: run_verilog
-
-run_verilog: verilog driver
-	python3 tests/python/verilog_accel.py
-
-run_chisel: chisel driver
-	python3 tests/python/chisel_accel.py
-
-driver: | $(build_dir)
-	cd $(build_dir) && cmake .. && make
-
-$(build_dir):
-	mkdir -p $@
-
-verilog:
-	make -C hardware/verilog
-
-chisel:
-	make -C hardware/chisel
-
-clean:
-	-rm -rf $(build_dir)
-	make -C hardware/chisel clean
-	make -C hardware/verilog clean
diff --git a/vta/vta-hw/apps/tsim_example/README.md b/vta/vta-hw/apps/tsim_example/README.md
deleted file mode 100644
index 07d984125610..000000000000
--- a/vta/vta-hw/apps/tsim_example/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA TSIM Installation
-======================
-
-*TSIM* is a cycle-accurate hardware simulation environment that can be invoked and managed directly from TVM. It aims to enable cycle accurate simulation of deep learning accelerators including VTA.
-This simulation environment can be used in both OSX and Linux.
-There are two dependencies required to make *TSIM* works: [Verilator](https://www.veripool.org/wiki/verilator) and [sbt](https://www.scala-sbt.org/) for accelerators designed in [Chisel3](https://github.com/freechipsproject/chisel3).
-
-## OSX Dependencies
-
-Install `sbt` and `verilator` using [Homebrew](https://brew.sh/).
-
-```bash
-brew install verilator sbt
-```
-
-## Linux Dependencies
-
-Add `sbt` to package manager (Ubuntu).
-
-```bash
-echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
-sudo apt-get update
-```
-
-Install `sbt` and `verilator`.
-
-```bash
-sudo apt install verilator sbt
-```
-
-Verilator version check
-
-```bash
-verilator --version
-```
-
-the supported version of Verilator should be at least 4.012,
-if homebrew (OSX) or package-manager (Linux) does not support that version,
-please install Verilator 4.012 or later from binary or source base on following
-instruction of Verilator wiki.
-
-https://www.veripool.org/projects/verilator/wiki/Installing
-
-## Setup in TVM
-
-1. Install `verilator` and `sbt` as described above
-2. Get tvm `git clone https://github.com/apache/incubator-tvm.git tvm --recursive`
-3. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
-
-## How to run VTA TSIM examples
-
-There are two sample VTA accelerators, add-a-constant, designed in Chisel3 and Verilog to show how *TSIM* works.
-The default target language for these two implementations is Verilog. The following instructions show
-how to run both of them:
-
-* Test Verilog backend
-    * Go to `<vta-hw-root>/apps/tsim_example`
-    * Run `make`
-
-* Test Chisel3 backend
-    * Go to `<vta-hw-root>/apps/tsim_example`
-    * Run `make run_chisel`
-
-* Some pointers
-    * Verilog and Chisel3 tests in `<vta-hw-root>/apps/tsim_example/tests/python`
-    * Verilog accelerator backend `<vta-hw-root>/apps/tsim_example/hardware/verilog`
-    * Chisel3 accelerator backend `<vta-hw-root>/apps/tsim_example/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<vta-hw-root>/apps/tsim_example/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<vta-hw-root>/apps/tsim_example/python/accel`
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile b/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile
deleted file mode 100644
index 2bbe7778095c..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed on a different location
-ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
-    else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
-  else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
-endif
-
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 0
-LIBNAME = libhw
-
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-verilator_build_dir = $(build_dir)/verilator
-chisel_build_dir = $(build_dir)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-cxx_flags = -O2 -Wall -fPIC -shared
-cxx_flags += -fvisibility=hidden -std=c++11
-cxx_flags += -DVL_TSIM_NAME=V$(TOP)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(vta_dir)/include
-cxx_flags += -I$(tvm_dir)/include
-cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
-
-cxx_files = $(VERILATOR_INC_DIR)/verilated.cpp
-cxx_files += $(VERILATOR_INC_DIR)/verilated_dpi.cpp
-cxx_files += $(wildcard $(verilator_build_dir)/*.cpp)
-cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc
-
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_path = $(build_dir)/$(LIBNAME).dylib
-else
-  lib_path = $(build_dir)/$(LIBNAME).so
-endif
-
-default: lint lib
-
-lint:
-	cp $(vta_dir)/hardware/chisel/scalastyle-config.xml .
-	sbt scalastyle
-
-lib: $(lib_path)
-$(lib_path): $(verilator_build_dir)/V$(TOP).cpp
-	g++ $(cxx_flags) $(cxx_files) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP).cpp
-$(verilator_build_dir)/V$(TOP).cpp: $(chisel_build_dir)/$(TOP).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).v
-$(chisel_build_dir)/$(TOP).v: install_vta_package
-	sbt 'test:runMain test.Elaborate --target-dir $(chisel_build_dir) --top-name $(TOP)'
-
-install_vta_package:
-	cd $(vta_dir)/hardware/chisel && sbt publishLocal
-
-clean:
-	-rm -rf $(build_dir) target project/target project/project
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt b/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt
deleted file mode 100644
index a2afc0d9d362..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "accel"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-libraryDependencies ++= Seq(
-  "edu.berkeley.cs" %% "chisel3" % "3.1.7",
-  "edu.washington.cs" %% "vta" % "0.1.0-SNAPSHOT",
-)
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties b/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998eb3eac..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt b/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 19ae5c9d49b9..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
deleted file mode 100644
index 7ba1e633629f..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import vta.dpi._
-
-/** Add-by-one accelerator.
- *
- * ___________      ___________
- * |         |      |         |
- * | HostDPI | <--> | RegFile | <->|
- * |_________|      |_________|    |
- *                                 |
- * ___________      ___________    |
- * |         |      |         |    |
- * | MemDPI  | <--> | Compute | <->|
- * |_________|      |_________|
- *
- */
-case class AccelConfig() {
-  val nCtrl = 1
-  val nECnt = 1
-  val nVals = 2
-  val nPtrs = 2
-  val regBits = 32
-  val ptrBits = 2 * regBits
-}
-
-class Accel extends Module {
-  val io = IO(new Bundle {
-    val host = new VTAHostDPIClient
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val config = AccelConfig()
-  val rf = Module(new RegFile)
-  val ce = Module(new Compute)
-  rf.io.host <> io.host
-  io.mem <> ce.io.mem
-  ce.io.launch := rf.io.launch
-  rf.io.finish := ce.io.finish
-  rf.io.ecnt <> ce.io.ecnt
-  ce.io.vals <> rf.io.vals
-  ce.io.ptrs <> rf.io.ptrs
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
deleted file mode 100644
index 3ef2e7e69bdb..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Compute
- *
- * Add-by-one procedure:
- *
- * 1. Wait for launch to be asserted
- * 2. Issue a read request for 8-byte value at inp_baddr address
- * 3. Wait for the value
- * 4. Issue a write request for 8-byte value at out_baddr address
- * 5. Increment read-address and write-address for next value
- * 6. Check if counter (cnt) is equal to length to assert finish,
- *    otherwise go to step 2.
- */
-class Compute(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Output(Bool())
-    val ecnt = Vec(config.nECnt, ValidIO(UInt(config.regBits.W)))
-    val vals = Input(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Input(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val mem = new VTAMemDPIMaster
-  })
-  val sIdle :: sReadReq :: sReadData :: sWriteReq :: sWriteData :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-  val const = io.vals(0)
-  val length = io.vals(1)
-  val cycles = RegInit(0.U(config.regBits.W))
-  val reg = Reg(chiselTypeOf(io.mem.rd.bits))
-  val cnt = Reg(UInt(config.regBits.W))
-  val raddr = Reg(UInt(config.ptrBits.W))
-  val waddr = Reg(UInt(config.ptrBits.W))
-
-  switch(state) {
-    is(sIdle) {
-      when(io.launch) {
-        state := sReadReq
-      }
-    }
-    is(sReadReq) {
-      state := sReadData
-    }
-    is(sReadData) {
-      when(io.mem.rd.valid) {
-        state := sWriteReq
-      }
-    }
-    is(sWriteReq) {
-      state := sWriteData
-    }
-    is(sWriteData) {
-      when(cnt === (length - 1.U)) {
-        state := sIdle
-      }.otherwise {
-        state := sReadReq
-      }
-    }
-  }
-
-  val last = state === sWriteData && cnt === (length - 1.U)
-
-  // cycle counter
-  when(state === sIdle) {
-    cycles := 0.U
-  }.otherwise {
-    cycles := cycles + 1.U
-  }
-
-  io.ecnt(0).valid := last
-  io.ecnt(0).bits := cycles
-
-  // calculate next address
-  when(state === sIdle) {
-    raddr := io.ptrs(0)
-    waddr := io.ptrs(1)
-  }.elsewhen(state === sWriteData) { // increment by 8-bytes
-    raddr := raddr + 8.U
-    waddr := waddr + 8.U
-  }
-
-  // create request
-  io.mem.req.valid := state === sReadReq | state === sWriteReq
-  io.mem.req.opcode := state === sWriteReq
-  io.mem.req.len := 0.U // one-word-per-request
-  io.mem.req.addr := Mux(state === sReadReq, raddr, waddr)
-
-  // read
-  when(state === sReadData && io.mem.rd.valid) {
-    reg := io.mem.rd.bits + const
-  }
-  io.mem.rd.ready := state === sReadData
-
-  // write
-  io.mem.wr.valid := state === sWriteData
-  io.mem.wr.bits := reg
-
-  // count read/write
-  when(state === sIdle) {
-    cnt := 0.U
-  }.elsewhen(state === sWriteData) {
-    cnt := cnt + 1.U
-  }
-
-  // done when read/write are equal to length
-  io.finish := last
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
deleted file mode 100644
index 2764510a68ee..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Register File.
- *
- * Six 32-bit register file.
- *
- * -------------------------------
- *  Register description    | addr
- * -------------------------|-----
- *  Control status register | 0x00
- *  Cycle counter           | 0x04
- *  Constant value          | 0x08
- *  Vector length           | 0x0c
- *  Input pointer lsb       | 0x10
- *  Input pointer msb       | 0x14
- *  Output pointer lsb      | 0x18
- *  Output pointer msb      | 0x1c
- * -------------------------------
- *
- * ------------------------------
- *  Control status register | bit
- * ------------------------------
- *  Launch                  | 0
- *  Finish                  | 1
- * ------------------------------
- */
-class RegFile(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Output(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(config.nECnt, Flipped(ValidIO(UInt(config.regBits.W))))
-    val vals = Output(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Output(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val host = new VTAHostDPIClient
-  })
-  val sIdle :: sRead :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.host.req.valid && !io.host.req.opcode) {
-        state := sRead
-      }
-    }
-    is(sRead) {
-      state := sIdle
-    }
-  }
-
-  io.host.req.deq := state === sIdle & io.host.req.valid
-
-  val nTotal = config.nCtrl + config.nECnt + config.nVals + (2 * config.nPtrs)
-  val reg =
-    Seq.fill(nTotal)(RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value))))
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg) map { case (a, r) => a.U -> r }
-  val eo = config.nCtrl
-  val vo = eo + config.nECnt
-  val po = vo + config.nVals
-
-  when(io.finish) {
-    reg(0) := "b_10".U
-  }.elsewhen(state === sIdle && io.host.req.valid &&
-    io.host.req.opcode && addr(0).U === io.host.req.addr) {
-    reg(0) := io.host.req.value
-  }
-
-  for (i <- 0 until config.nECnt) {
-    when(io.ecnt(i).valid) {
-      reg(eo + i) := io.ecnt(i).bits
-    }.elsewhen(state === sIdle && io.host.req.valid &&
-      io.host.req.opcode && addr(eo + i).U === io.host.req.addr) {
-      reg(eo + i) := io.host.req.value
-    }
-  }
-
-  for (i <- 0 until (config.nVals + (2 * config.nPtrs))) {
-    when(state === sIdle && io.host.req.valid &&
-      io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
-      reg(vo + i) := io.host.req.value
-    }
-  }
-
-  val rdata = RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value)))
-  when(state === sIdle && io.host.req.valid && !io.host.req.opcode) {
-    rdata := MuxLookup(io.host.req.addr, 0.U, reg_map)
-  }
-
-  io.host.resp.valid := state === sRead
-  io.host.resp.bits := rdata
-
-  io.launch := reg(0)(0)
-
-  for (i <- 0 until config.nVals) {
-    io.vals(i) := reg(vo + i)
-  }
-
-  for (i <- 0 until config.nPtrs) {
-    io.ptrs(i) := Cat(reg(po + (2 * i) + 1), reg(po + (2 * i)))
-  }
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
deleted file mode 100644
index d931620ec67d..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package test
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.dpi._
-import accel._
-
-/** VTA simulation shell.
-  *
-  * Instantiate Host and Memory DPI modules.
-  *
-  */
-class VTASimShell extends MultiIOModule {
-  val host = IO(new VTAHostDPIMaster)
-  val mem = IO(new VTAMemDPIClient)
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASimDPI)
-  val mod_host = Module(new VTAHostDPI)
-  val mod_mem = Module(new VTAMemDPI)
-  mod_mem.io.clock := clock
-  mod_mem.io.reset := reset
-  mod_mem.io.dpi <> mem
-  mod_host.io.clock := clock
-  mod_host.io.reset := reset
-  host <> mod_host.io.dpi
-  mod_sim.io.clock := sim_clock
-  mod_sim.io.reset := reset
-  sim_wait := mod_sim.io.dpi_wait
-}
-
-/** Test accelerator.
-  *
-  * Instantiate and connect the simulation-shell and the accelerator.
-  *
-  */
-class TestAccel extends MultiIOModule {
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val sim_shell = Module(new VTASimShell)
-  val vta_accel = Module(new Accel)
-  sim_shell.sim_clock := sim_clock
-  sim_wait := sim_shell.sim_wait
-  sim_shell.mem <> vta_accel.io.mem
-  vta_accel.io.host <> sim_shell.host
-}
-
-/** Generate TestAccel as top module */
-object Elaborate extends App {
-  chisel3.Driver.execute(args, () => new TestAccel)
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile b/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile
deleted file mode 100644
index 72b0a2a14eba..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Abort early when the verilator executable cannot be found on PATH
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# VERILATOR_INC_DIR may be preset by the caller when Verilator is installed
-# elsewhere; otherwise probe the two usual prefixes, /usr/local first
-ifeq (, $(VERILATOR_INC_DIR))
-  ifneq (, $(wildcard /usr/local/share/verilator/include/*))
-    VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  else ifneq (, $(wildcard /usr/share/verilator/include/*))
-    VERILATOR_INC_DIR := /usr/share/verilator/include
-  else
-    $(error "Verilator include directory is not set properly")
-  endif
-endif
-
-# Top-level Verilog module, output directory name, trace switch, library name
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 0
-LIBNAME = libhw
-
-# Locations relative to this Makefile's directory
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-# Options handed to verilator when translating the Verilog sources to C++
-verilator_opt = \
-  --cc \
-  +define+RANDOMIZE_GARBAGE_ASSIGN \
-  +define+RANDOMIZE_REG_INIT \
-  +define+RANDOMIZE_MEM_INIT \
-  --x-assign unique \
-  --output-split 20000 \
-  --output-split-cfuncs 20000 \
-  --top-module ${TOP} \
-  -Mdir ${build_dir}
-
-# Compiler flags for building the verilated model into a shared library
-cxx_flags = \
-  -O2 -Wall -fPIC -shared \
-  -fvisibility=hidden -std=c++11 \
-  -DVL_TSIM_NAME=V$(TOP) \
-  -DVL_PRINTF=printf \
-  -DVL_USER_FINISH \
-  -DVM_COVERAGE=0 \
-  -DVM_SC=0 \
-  -Wno-sign-compare \
-  -include V$(TOP).h \
-  -I$(build_dir) \
-  -I$(VERILATOR_INC_DIR) \
-  -I$(VERILATOR_INC_DIR)/vltstd \
-  -I$(vta_dir)/include \
-  -I$(tvm_dir)/include \
-  -I$(tvm_dir)/3rdparty/dlpack/include
-
-# C++ translation units: Verilator runtime, generated model, tsim device glue
-cxx_files = \
-  $(VERILATOR_INC_DIR)/verilated.cpp \
-  $(VERILATOR_INC_DIR)/verilated_dpi.cpp \
-  $(wildcard $(build_dir)/*.cpp) \
-  $(vta_dir)/hardware/dpi/tsim_device.cc
-
-# Verilog inputs: this example's sources plus the shared VTA Verilog resources
-v_files = $(wildcard $(abspath .)/src/*.v $(vta_dir)/hardware/chisel/src/main/resources/verilog/*.v)
-
-# Waveform tracing is opt-in through USE_TRACE
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_ext = dylib
-else
-  lib_ext = so
-endif
-lib_path = $(build_dir)/$(LIBNAME).$(lib_ext)
-
-# These targets do not name real files; declaring them phony keeps them
-# runnable even if a file called default/lib/verilator/clean shows up here
-.PHONY: default lib verilator clean
-
-default: lib
-
-# Compile the verilated model and the tsim glue into one shared library.
-# $(CXX) defaults to g++ in GNU make and can be overridden by the caller.
-lib: $(lib_path)
-$(lib_path): $(build_dir)/V$(TOP).cpp
-	$(CXX) $(cxx_flags) $(cxx_files) -o $@
-
-# Translate the Verilog sources to C++; the build directory is an order-only
-# prerequisite so its timestamp never forces a regeneration
-verilator: $(build_dir)/V$(TOP).cpp
-$(build_dir)/V$(TOP).cpp: $(v_files) | $(build_dir)
-	verilator $(verilator_opt) $(v_files)
-
-$(build_dir):
-	mkdir -p $@
-
-# Leading '-' on the recipe: a failing rm does not fail the target
-clean:
-	-rm -rf $(build_dir)
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v
deleted file mode 100644
index 34d7d957a858..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Add-by-one accelerator.
-  *
-  * ___________      ___________
-  * |         |      |         |
-  * | HostDPI | <--> | RegFile | <->|
-  * |_________|      |_________|    |
-  *                                 |
-  * ___________      ___________    |
-  * |         |      |         |    |
-  * | MemDPI  | <--> | Compute | <->|
-  * |_________|      |_________|
-  *
-  * Purely structural: the host_* ports go to the RegFile instance, the mem_*
-  * ports go to the Compute instance, and the two instances are tied together
-  * by the launch/finish, event-counter and configuration signals below.
-  *
-  * Parameters:
-  *   HOST_ADDR_BITS  width of host_req_addr
-  *   HOST_DATA_BITS  width of host request/response data
-  *   MEM_LEN_BITS    width of mem_req_len
-  *   MEM_ADDR_BITS   width of mem_req_addr
-  *   MEM_DATA_BITS   width of memory read/write data
-  */
-module Accel #
-( parameter HOST_ADDR_BITS = 8,
-  parameter HOST_DATA_BITS = 32,
-  parameter MEM_LEN_BITS = 8,
-  parameter MEM_ADDR_BITS = 64,
-  parameter MEM_DATA_BITS = 64
-)
-(
-  input                         clock,
-  input                         reset,
-
-  // host interface (connected to RegFile)
-  input                         host_req_valid,
-  input                         host_req_opcode,
-  input    [HOST_ADDR_BITS-1:0] host_req_addr,
-  input    [HOST_DATA_BITS-1:0] host_req_value,
-  output                        host_req_deq,
-  output                        host_resp_valid,
-  output   [HOST_DATA_BITS-1:0] host_resp_bits,
-
-  // memory interface (connected to Compute)
-  output                        mem_req_valid,
-  output                        mem_req_opcode,
-  output     [MEM_LEN_BITS-1:0] mem_req_len,
-  output    [MEM_ADDR_BITS-1:0] mem_req_addr,
-  output                        mem_wr_valid,
-  output    [MEM_DATA_BITS-1:0] mem_wr_bits,
-  input                         mem_rd_valid,
-  input     [MEM_DATA_BITS-1:0] mem_rd_bits,
-  output                        mem_rd_ready
-);
-
-  // control handshake: launch is driven by RegFile, finish by Compute
-  logic                      launch;
-  logic                      finish;
-
-  // event counter reported by Compute and captured by RegFile
-  logic                      event_counter_valid;
-  logic [HOST_DATA_BITS-1:0] event_counter_value;
-
-  // configuration values driven by RegFile and consumed by Compute
-  logic [HOST_DATA_BITS-1:0] constant;
-  logic [HOST_DATA_BITS-1:0] length;
-  logic  [MEM_ADDR_BITS-1:0] inp_baddr;
-  logic  [MEM_ADDR_BITS-1:0] out_baddr;
-
-  // host-facing register file
-  RegFile #
-  (
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .HOST_ADDR_BITS(HOST_ADDR_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS)
-  )
-  rf
-  (
-    .clock               (clock),
-    .reset               (reset),
-
-    .host_req_valid      (host_req_valid),
-    .host_req_opcode     (host_req_opcode),
-    .host_req_addr       (host_req_addr),
-    .host_req_value      (host_req_value),
-    .host_req_deq        (host_req_deq),
-    .host_resp_valid     (host_resp_valid),
-    .host_resp_bits      (host_resp_bits),
-
-    .launch              (launch),
-    .finish              (finish),
-
-    .event_counter_valid (event_counter_valid),
-    .event_counter_value (event_counter_value),
-
-    .constant            (constant),
-    .length              (length),
-    .inp_baddr           (inp_baddr),
-    .out_baddr           (out_baddr)
-  );
-
-  // memory-facing compute engine
-  Compute #
-  (
-    .MEM_LEN_BITS(MEM_LEN_BITS),
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .MEM_DATA_BITS(MEM_DATA_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS)
-  )
-  comp
-  (
-    .clock               (clock),
-    .reset               (reset),
-
-    .mem_req_valid       (mem_req_valid),
-    .mem_req_opcode      (mem_req_opcode),
-    .mem_req_len         (mem_req_len),
-    .mem_req_addr        (mem_req_addr),
-    .mem_wr_valid        (mem_wr_valid),
-    .mem_wr_bits         (mem_wr_bits),
-    .mem_rd_valid        (mem_rd_valid),
-    .mem_rd_bits         (mem_rd_bits),
-    .mem_rd_ready        (mem_rd_ready),
-
-    .launch              (launch),
-    .finish              (finish),
-
-    .event_counter_valid (event_counter_valid),
-    .event_counter_value (event_counter_value),
-
-    .constant            (constant),
-    .length              (length),
-    .inp_baddr           (inp_baddr),
-    .out_baddr           (out_baddr)
-  );
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v
deleted file mode 100644
index 4360b1ca20dd..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Compute
-  *
-  * Add-by-constant procedure:
-  *
-  * 1. Wait for launch to be asserted
-  * 2. Issue a read request for one MEM_DATA_BITS-wide word at the read
-  *    address (which starts at inp_baddr)
-  * 3. Wait for the value and add `constant` to it
-  * 4. Issue a write request for the result at the write address (which
-  *    starts at out_baddr)
-  * 5. Increment read-address and write-address by one word for next value
-  * 6. Check if counter (cnt) is equal to length - 1 to assert finish,
-  *    otherwise go to step 2.
-  *
-  * The padding of `constant`, the address stride and the counter width are
-  * derived from the module parameters; with the default parameter values
-  * they equal the previously hard-coded 32 bits / 8 bytes / 32 bits.
-  *
-  * NOTE: length == 0 is not handled specially. `length - 1` wraps around,
-  * so the host is expected to program a non-zero length.
-  */
-module Compute #
-(
-  parameter MEM_LEN_BITS = 8,
-  parameter MEM_ADDR_BITS = 64,
-  parameter MEM_DATA_BITS = 64,
-  parameter HOST_DATA_BITS = 32
-)
-(
-  input                         clock,
-  input                         reset,
-
-  output                        mem_req_valid,
-  output                        mem_req_opcode,
-  output     [MEM_LEN_BITS-1:0] mem_req_len,
-  output    [MEM_ADDR_BITS-1:0] mem_req_addr,
-  output                        mem_wr_valid,
-  output    [MEM_DATA_BITS-1:0] mem_wr_bits,
-  input                         mem_rd_valid,
-  input     [MEM_DATA_BITS-1:0] mem_rd_bits,
-  output                        mem_rd_ready,
-
-  input                         launch,
-  output                        finish,
-
-  output                        event_counter_valid,
-  output   [HOST_DATA_BITS-1:0] event_counter_value,
-
-  input    [HOST_DATA_BITS-1:0] constant,
-  input    [HOST_DATA_BITS-1:0] length,
-  input     [MEM_ADDR_BITS-1:0] inp_baddr,
-  input     [MEM_ADDR_BITS-1:0] out_baddr
-);
-
-  typedef enum logic [2:0] {IDLE,
-                            READ_REQ,
-                            READ_DATA,
-                            WRITE_REQ,
-                            WRITE_DATA} state_t;
-
-  state_t state_n, state_r;
-
-  // element counter, same width as `length` which it is compared against
-  logic [HOST_DATA_BITS-1:0] cnt;
-  logic [MEM_DATA_BITS-1:0] data;
-  logic [MEM_ADDR_BITS-1:0] raddr;
-  logic [MEM_ADDR_BITS-1:0] waddr;
-
-  // state register
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      state_r <= IDLE;
-    end else begin
-      state_r <= state_n;
-    end
-  end
-
-  // next-state logic; defaults to IDLE so every path assigns state_n
-  always_comb begin
-    state_n = IDLE;
-    case (state_r)
-      IDLE: begin
-        if (launch) begin
-          state_n = READ_REQ;
-        end
-      end
-
-      READ_REQ: begin
-        state_n = READ_DATA;
-      end
-
-      READ_DATA: begin
-        // hold until the memory returns the requested word
-        if (mem_rd_valid) begin
-          state_n = WRITE_REQ;
-        end else begin
-          state_n = READ_DATA;
-        end
-      end
-
-      WRITE_REQ: begin
-        state_n = WRITE_DATA;
-      end
-
-      WRITE_DATA: begin
-        // last element written: go idle, otherwise fetch the next one
-        if (cnt == (length - 1'b1)) begin
-          state_n = IDLE;
-        end else begin
-          state_n = READ_REQ;
-        end
-      end
-
-      default: begin
-      end
-    endcase
-  end
-
-  // high during the write of the final element
-  logic last;
-  assign last = (state_r == WRITE_DATA) & (cnt == (length - 1'b1));
-
-  // cycle counter: cleared while idle, counts every cycle of a run
-  logic [HOST_DATA_BITS-1:0] cycle_counter;
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      cycle_counter <= '0;
-    end else begin
-      cycle_counter <= cycle_counter + 1'b1;
-    end
-  end
-
-  assign event_counter_valid = last;
-  assign event_counter_value = cycle_counter;
-
-  // calculate next address: reload the base addresses while idle, then step
-  // by one memory word (MEM_DATA_BITS / 8 bytes) after each write
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      raddr <= inp_baddr;
-      waddr <= out_baddr;
-    end else if (state_r == WRITE_DATA) begin
-      raddr <= raddr + MEM_ADDR_BITS'(MEM_DATA_BITS / 8);
-      waddr <= waddr + MEM_ADDR_BITS'(MEM_DATA_BITS / 8);
-    end
-  end
-
-  // create request
-  assign mem_req_valid = (state_r == READ_REQ) | (state_r == WRITE_REQ);
-  assign mem_req_opcode = state_r == WRITE_REQ;
-  assign mem_req_len = 'd0; // one-word-per-request
-  assign mem_req_addr = (state_r == READ_REQ)? raddr : waddr;
-
-  // read: capture the incoming word plus `constant`, resized (zero-extended
-  // when narrower) to the memory data width
-  always_ff @(posedge clock) begin
-    if ((state_r == READ_DATA) & mem_rd_valid) begin
-      data <= mem_rd_bits + MEM_DATA_BITS'(constant);
-    end
-  end
-  assign mem_rd_ready = state_r == READ_DATA;
-
-  // write
-  assign mem_wr_valid = state_r == WRITE_DATA;
-  assign mem_wr_bits = data;
-
-  // count read/write
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      cnt <= 'd0;
-    end else if (state_r == WRITE_DATA) begin
-      cnt <= cnt + 1'b1;
-    end
-  end
-
-  // done when read/write are equal to length
-  assign finish = last;
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v
deleted file mode 100644
index 7174682dc8a2..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Register File.
-  *
-  * Eight 32-bit register file.
-  *
-  * -------------------------------
-  *  Register description    | addr
-  * -------------------------|-----
-  *  Control status register | 0x00
-  *  Cycle counter           | 0x04
-  *  Constant value          | 0x08
-  *  Vector length           | 0x0c
-  *  Input pointer lsb       | 0x10
-  *  Input pointer msb       | 0x14
-  *  Output pointer lsb      | 0x18
-  *  Output pointer msb      | 0x1c
-  * -------------------------------
-  *
-  * ------------------------------
-  *  Control status register | bit
-  * ------------------------------
-  *  Launch                  | 0
-  *  Finish                  | 1
-  * ------------------------------
-  *
-  * Host protocol as implemented below: a request is dequeued whenever the
-  * state machine is IDLE. Writes (host_req_opcode = 1) update the addressed
-  * register in that cycle; reads (host_req_opcode = 0) latch the addressed
-  * register and present it with host_resp_valid one cycle later.
-  */
-module RegFile #
- (parameter MEM_ADDR_BITS = 64,
-  parameter HOST_ADDR_BITS = 8,
-  parameter HOST_DATA_BITS = 32
-)
-(
-  input                         clock,
-  input                         reset,
-
-  input                         host_req_valid,
-  input                         host_req_opcode,
-  input    [HOST_ADDR_BITS-1:0] host_req_addr,
-  input    [HOST_DATA_BITS-1:0] host_req_value,
-  output                        host_req_deq,
-  output                        host_resp_valid,
-  output   [HOST_DATA_BITS-1:0] host_resp_bits,
-
-  output                        launch,
-  input                         finish,
-
-  input                         event_counter_valid,
-  input    [HOST_DATA_BITS-1:0] event_counter_value,
-
-  output   [HOST_DATA_BITS-1:0] constant,
-  output   [HOST_DATA_BITS-1:0] length,
-  output    [MEM_ADDR_BITS-1:0] inp_baddr,
-  output    [MEM_ADDR_BITS-1:0] out_baddr
-);
-
-  localparam NUM_REG = 8;
-
-  typedef enum logic {IDLE, READ} state_t;
-  state_t state_n, state_r;
-
-  // state register
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      state_r <= IDLE;
-    end else begin
-      state_r <= state_n;
-    end
-  end
-
-  // a read request moves IDLE -> READ for one cycle; everything else stays IDLE
-  always_comb begin
-    state_n = IDLE;
-    case (state_r)
-      IDLE: begin
-        if (host_req_valid & ~host_req_opcode) begin
-          state_n = READ;
-        end
-      end
-
-      READ: begin
-        state_n = IDLE;
-      end
-    endcase
-  end
-
-  assign host_req_deq = (state_r == IDLE) ? host_req_valid : 1'b0;
-
-  logic [HOST_DATA_BITS-1:0] rf [NUM_REG-1:0];
-
-  genvar i;
-  for (i = 0; i < NUM_REG; i++) begin
-
-    // Write enable for register i (byte address i*4). It is driven by a
-    // continuous assignment: an initializer on the variable declaration is
-    // evaluated only once at time zero and would not follow the host signals.
-    logic wen;
-    assign wen = (state_r == IDLE)? host_req_valid & host_req_opcode & i*4 == host_req_addr : 1'b0;
-
-    if (i == 0) begin
-
-      // control/status: finish from the compute engine takes priority over
-      // a host write and stores 2 (Finish bit set, Launch bit cleared)
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (finish) begin
-          rf[i] <= 'd2;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end else if (i == 1) begin
-
-      // cycle counter: a valid event-counter value takes priority over a host write
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (event_counter_valid) begin
-          rf[i] <= event_counter_value;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end else begin
-
-      // plain host-writable configuration registers
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end
-
-  end
-
-  // read path: latch the addressed register when a read request is accepted;
-  // addresses outside the table read as zero
-  logic [HOST_DATA_BITS-1:0] rdata;
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      rdata <= 'd0;
-    end else if ((state_r == IDLE) & host_req_valid & ~host_req_opcode) begin
-      if (host_req_addr == 'h00) begin
-        rdata <= rf[0];
-      end else if (host_req_addr == 'h04) begin
-        rdata <= rf[1];
-      end else if (host_req_addr == 'h08) begin
-        rdata <= rf[2];
-      end else if (host_req_addr == 'h0c) begin
-        rdata <= rf[3];
-      end else if (host_req_addr == 'h10) begin
-        rdata <= rf[4];
-      end else if (host_req_addr == 'h14) begin
-        rdata <= rf[5];
-      end else if (host_req_addr == 'h18) begin
-        rdata <= rf[6];
-      end else if (host_req_addr == 'h1c) begin
-        rdata <= rf[7];
-      end else begin
-        rdata <= 'd0;
-      end
-    end
-  end
-
-  assign host_resp_valid = (state_r == READ);
-  assign host_resp_bits = rdata;
-
-  // outputs towards the compute engine
-  assign launch = rf[0][0];
-  assign constant = rf[2];
-  assign length = rf[3];
-  assign inp_baddr = {rf[5], rf[4]};
-  assign out_baddr = {rf[7], rf[6]};
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v
deleted file mode 100644
index cc1ec8580d62..000000000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Test accelerator.
-  *
-  * Instantiate host/memory DPI modules and connect them to the accelerator.
-  *
-  * Ports:
-  *   clock, reset  drive the host DPI, memory DPI and accelerator instances
-  *   sim_clock     drives only the VTASimDPI instance
-  *   sim_wait      output of the VTASimDPI instance (dpi_wait)
-  */
-module TestAccel
-(
-  input clock,
-  input reset,
-  input sim_clock,
-  output sim_wait
-);
-
-  // host-side widths, forwarded to Accel below
-  localparam HOST_ADDR_BITS = 8;
-  localparam HOST_DATA_BITS = 32;
-
-  // host DPI <-> accelerator wires
-  logic                      host_req_valid;
-  logic                      host_req_opcode;
-  logic [HOST_ADDR_BITS-1:0] host_req_addr;
-  logic [HOST_DATA_BITS-1:0] host_req_value;
-  logic                      host_req_deq;
-  logic                      host_resp_valid;
-  logic [HOST_DATA_BITS-1:0] host_resp_bits;
-
-  // memory-side widths, forwarded to Accel below
-  localparam MEM_LEN_BITS = 8;
-  localparam MEM_ADDR_BITS = 64;
-  localparam MEM_DATA_BITS = 64;
-
-  // memory DPI <-> accelerator wires
-  logic                     mem_req_valid;
-  logic                     mem_req_opcode;
-  logic  [MEM_LEN_BITS-1:0] mem_req_len;
-  logic [MEM_ADDR_BITS-1:0] mem_req_addr;
-  logic                     mem_wr_valid;
-  logic [MEM_DATA_BITS-1:0] mem_wr_bits;
-  logic                     mem_rd_valid;
-  logic [MEM_DATA_BITS-1:0] mem_rd_bits;
-  logic                     mem_rd_ready;
-
-  // simulation control, clocked by sim_clock rather than clock
-  VTASimDPI sim
-  (
-    .clock          (sim_clock),
-    .reset          (reset),
-
-    .dpi_wait       (sim_wait)
-  );
-
-  // host-side DPI module
-  VTAHostDPI host
-  (
-    .clock          (clock),
-    .reset          (reset),
-
-    .dpi_req_valid  (host_req_valid),
-    .dpi_req_opcode (host_req_opcode),
-    .dpi_req_addr   (host_req_addr),
-    .dpi_req_value  (host_req_value),
-    .dpi_req_deq    (host_req_deq),
-    .dpi_resp_valid (host_resp_valid),
-    .dpi_resp_bits  (host_resp_bits)
-  );
-
-  // memory-side DPI module
-  VTAMemDPI mem
-  (
-    .clock          (clock),
-    .reset          (reset),
-
-    .dpi_req_valid  (mem_req_valid),
-    .dpi_req_opcode (mem_req_opcode),
-    .dpi_req_len    (mem_req_len),
-    .dpi_req_addr   (mem_req_addr),
-    .dpi_wr_valid   (mem_wr_valid),
-    .dpi_wr_bits    (mem_wr_bits),
-    .dpi_rd_valid   (mem_rd_valid),
-    .dpi_rd_bits    (mem_rd_bits),
-    .dpi_rd_ready   (mem_rd_ready)
-  );
-
-  // accelerator under test
-  Accel #
-  (
-    .HOST_ADDR_BITS(HOST_ADDR_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS),
-    .MEM_LEN_BITS(MEM_LEN_BITS),
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .MEM_DATA_BITS(MEM_DATA_BITS)
-  )
-  accel
-  (
-    .clock           (clock),
-    .reset           (reset),
-
-    .host_req_valid  (host_req_valid),
-    .host_req_opcode (host_req_opcode),
-    .host_req_addr   (host_req_addr),
-    .host_req_value  (host_req_value),
-    .host_req_deq    (host_req_deq),
-    .host_resp_valid (host_resp_valid),
-    .host_resp_bits  (host_resp_bits),
-
-    .mem_req_valid   (mem_req_valid),
-    .mem_req_opcode  (mem_req_opcode),
-    .mem_req_len     (mem_req_len),
-    .mem_req_addr    (mem_req_addr),
-    .mem_wr_valid    (mem_wr_valid),
-    .mem_wr_bits     (mem_wr_bits),
-    .mem_rd_valid    (mem_rd_valid),
-    .mem_rd_bits     (mem_rd_bits),
-    .mem_rd_ready    (mem_rd_ready)
-  );
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/python/__init__.py b/vta/vta-hw/apps/tsim_example/python/__init__.py
deleted file mode 100644
index 4bc21e287d69..000000000000
--- a/vta/vta-hw/apps/tsim_example/python/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from . import tsim
diff --git a/vta/vta-hw/apps/tsim_example/python/tsim.py b/vta/vta-hw/apps/tsim_example/python/tsim.py
deleted file mode 100644
index 85fd463e3278..000000000000
--- a/vta/vta-hw/apps/tsim_example/python/tsim.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import ctypes
-import os.path as osp
-from sys import platform
-
-def get_ext():
-    """Return shared library extension"""
-    return ".dylib" if platform == "darwin" else ".so"
-
-def load_dll(dll):
-    """Load shared library
-
-    Parameters
-    ------------
-    dll : str
-        Path for shared library
-
-    Returns
-    ------------
-    The shared library
-    """
-    try:
-        return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
-    except OSError:
-        return []
-
-def load_sw():
-    """Load all software shared libraries"""
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    sw_libname = "libsw" + get_ext()
-    sw_lib = osp.join(cur_path, "..", "build", sw_libname)
-    load_dll(sw_lib)
-
-def init(hw_backend):
-    """Init hardware and software shared library for accelerator
-
-    Parameters
-    ------------
-    hw_backend : str
-        Hardware backend can be verilog or chisel
-
-    """
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    hw_libname = "libhw" + get_ext()
-    if hw_backend in ("verilog", "chisel"):
-        hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
-    load_sw()
-    m = tvm.runtime.load_module(hw_lib, "vta-tsim")
-    f = tvm.get_global_func("tvm.vta.tsim.init")
-    f(m)
-
-def load_module():
-    """Return driver function"""
-    load_sw()
-    return tvm.get_global_func("tvm.vta.driver")
diff --git a/vta/vta-hw/apps/tsim_example/src/driver.cc b/vta/vta-hw/apps/tsim_example/src/driver.cc
deleted file mode 100644
index 95606961e8a9..000000000000
--- a/vta/vta-hw/apps/tsim_example/src/driver.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/registry.h>
-#include <vta/dpi/module.h>
-
-#include "vmem/virtual_memory.h"
-
-namespace vta {
-namespace driver {
-
-using vta::dpi::DPIModuleNode;
-using tvm::runtime::Module;
-
-/*!
- * \brief Process-wide holder of the DPI simulation module.
- *
- * Init() stores the module, launches the simulation and puts it into the
- * wait state. The destructor resumes and finishes the simulation, but only
- * if Init() was actually called.
- */
-class DPILoader {
- public:
-  ~DPILoader() {
-    // The singleton is created by Global() even when Init() is never called
-    // (e.g. a Device is constructed without a prior tsim init), in which
-    // case there is no simulation to shut down and dpi_ is still null.
-    if (dpi_ == nullptr) return;
-    dpi_->SimResume();
-    dpi_->SimFinish();
-  }
-
-  /*!
-   * \brief Attach the simulation module, launch it and leave it waiting.
-   * \param module The loaded DPI module.
-   */
-  void Init(Module module) {
-    mod_ = module;
-    dpi_ = this->Get();
-    dpi_->SimLaunch();
-    dpi_->SimWait();
-  }
-
-  /*! \brief Return the stored module's node viewed as a DPIModuleNode. */
-  DPIModuleNode* Get() {
-    return static_cast<DPIModuleNode*>(mod_.operator->());
-  }
-
-  /*! \brief Return the function-local static singleton instance. */
-  static DPILoader* Global() {
-    static DPILoader inst;
-    return &inst;
-  }
-
-  // TVM module
-  Module mod_;
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-};
-
-/*!
- * \brief Driver for one accelerator run through the DPI simulation.
- *
- * Run() stages the input in simulator memory, programs the registers,
- * polls for completion and copies the result back to the host tensor.
- */
-class Device {
- public:
-  Device() : loader_(DPILoader::Global()) {}
-
-  /*!
-   * \brief Execute the accelerator on tensor a and store the result in b.
-   * \param c Value written to the constant register (0x08).
-   * \param a Input tensor; its first dimension is used as the element count.
-   * \param b Output tensor; receives as many bytes as were read from a.
-   * \return The value read back from the cycle-counter register (0x04).
-   */
-  uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
-    uint32_t num_elems = a->shape[0];
-    size_t num_bytes = (a->dtype.bits >> 3) * num_elems;
-    a_ = this->MemAlloc(num_bytes);
-    b_ = this->MemAlloc(num_bytes);
-    this->MemCopyFromHost(a_, a->data, num_bytes);
-    this->Init();
-    this->Launch(c, num_elems);
-    uint32_t elapsed = this->WaitForCompletion();
-    this->MemCopyToHost(b->data, b_, num_bytes);
-    this->MemFree(a_);
-    this->MemFree(b_);
-    return elapsed;
-  }
-
- private:
-  // Shorthand for the virtual memory manager used by all Mem* helpers
-  using VMM = vta::vmem::VirtualMemoryManager;
-
-  // Fetch the DPI module from the loader and resume the simulation
-  void Init() {
-    dpi_ = loader_->Get();
-    dpi_->SimResume();
-  }
-
-  // Allocate a buffer and return its physical address packed into a pointer
-  void* MemAlloc(size_t size) {
-    void* virt = VMM::Global()->Alloc(size);
-    return reinterpret_cast<void*>(VMM::Global()->GetPhyAddr(virt));
-  }
-
-  // Translate the packed physical address back and release the buffer
-  void MemFree(void* buf) {
-    void* virt = VMM::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
-    VMM::Global()->Free(virt);
-  }
-
-  // Reinterpret the packed pointer as an integer address
-  vta_phy_addr_t MemGetPhyAddr(void* buf) {
-    uint64_t* as_words = reinterpret_cast<uint64_t*>(buf);
-    return reinterpret_cast<uint64_t>(as_words);
-  }
-
-  void MemCopyFromHost(void* dst, const void* src, size_t size) {
-    VMM::Global()->MemCopyFromHost(dst, src, size);
-  }
-
-  void MemCopyToHost(void* dst, const void* src, size_t size) {
-    VMM::Global()->MemCopyToHost(dst, src, size);
-  }
-
-  // Program constant, length and both base addresses, then set the launch bit
-  void Launch(uint32_t c, uint32_t len) {
-    dpi_->WriteReg(0x08, c);
-    dpi_->WriteReg(0x0c, len);
-    dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
-    dpi_->WriteReg(0x14, 0);
-    dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
-    dpi_->WriteReg(0x1c, 0);
-    dpi_->WriteReg(0x00, 0x1); // launch
-  }
-
-  // Poll the status register up to wait_cycles_ times, then read the cycle
-  // counter and put the simulation back into the wait state
-  uint32_t WaitForCompletion() {
-    for (uint32_t polls = 0; polls < wait_cycles_; polls++) {
-      uint32_t status = dpi_->ReadReg(0x00);
-      if (status == 2) break; // finish
-    }
-    uint32_t counter = dpi_->ReadReg(0x04);
-    dpi_->SimWait();
-    return counter;
-  }
-
-  // wait cycles
-  uint32_t wait_cycles_{100000000};
-  // DPI loader
-  DPILoader* loader_{nullptr};
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-  // input vm ptr
-  void* a_{nullptr};
-  // output vm ptr
-  void* b_{nullptr};
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-// "tvm.vta.tsim.init": args[0] is the simulation module handed to the loader
-TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Module sim_module = args[0];
-    DPILoader::Global()->Init(sim_module);
-  });
-
-// "tvm.vta.driver": args are (input tensor, output tensor, constant);
-// the return value is the cycle count reported by Device::Run
-TVM_REGISTER_GLOBAL("tvm.vta.driver")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Device device;
-    DLTensor* input = args[0];
-    DLTensor* output = args[1];
-    uint32_t addend = static_cast<int>(args[2]);
-    uint32_t elapsed = device.Run(addend, input, output);
-    *rv = static_cast<int>(elapsed);
-  });
-
-}  // namespace driver
-}  // namespace vta
diff --git a/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py b/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py
deleted file mode 100644
index 370ac4068e18..000000000000
--- a/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-
-def test_accel():
-    rmax = 64
-    dtype = "uint64"
-    n = np.random.randint(1, rmax)
-    c = np.random.randint(0, rmax)
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
-    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
-    f = tsim.load_module()
-    cycles = f(a, b, c)
-    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg)
-    print("[PASS] " + msg)
-
-if __name__ == "__main__":
-    tsim.init("chisel")
-    for i in range(10):
-        test_accel()
diff --git a/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py b/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py
deleted file mode 100644
index 3489ff2f6fed..000000000000
--- a/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-
-def test_accel():
-    rmax = 64
-    dtype = "uint64"
-    n = np.random.randint(1, rmax)
-    c = np.random.randint(0, rmax)
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
-    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
-    f = tsim.load_module()
-    cycles = f(a, b, c)
-    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg)
-    print("[PASS] " + msg)
-
-if __name__ == "__main__":
-    tsim.init("verilog")
-    for i in range(10):
-        test_accel()
diff --git a/vta/vta-hw/config/README.md b/vta/vta-hw/config/README.md
deleted file mode 100644
index b675ef293e10..000000000000
--- a/vta/vta-hw/config/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# VTA Configuration
-
-Each VTA runtime/hardware configuration is specified by vta_config.json file.
-You can copy the vta_config.json to tvm project root and modify the configuration
-before you type make.
-
-The config is going to affect the behavior of python package as well as
-the hardware runtime build.
diff --git a/vta/vta-hw/config/de10nano_sample.json b/vta/vta-hw/config/de10nano_sample.json
deleted file mode 100644
index e4148c3e8ecf..000000000000
--- a/vta/vta-hw/config/de10nano_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "de10nano",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/fsim_sample.json b/vta/vta-hw/config/fsim_sample.json
deleted file mode 100644
index 0591bb486143..000000000000
--- a/vta/vta-hw/config/fsim_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "sim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/pkg_config.py b/vta/vta-hw/config/pkg_config.py
deleted file mode 100644
index 9c57706e0274..000000000000
--- a/vta/vta-hw/config/pkg_config.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""VTA Package configuration module
-
-This module is dependency free and can be used to configure package.
-"""
-from __future__ import absolute_import as _abs
-
-import json
-import glob
-import os
-
-
-def get_vta_hw_path():
-    """Get the VTA HW path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    vta_hw_default = os.path.abspath(os.path.join(curr_path, ".."))
-    VTA_HW_PATH = os.getenv('VTA_HW_PATH', vta_hw_default)
-    return VTA_HW_PATH
-
-def get_tvm_path():
-    """Get the TVM path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    tvm_default = os.path.abspath(os.path.join(curr_path, "../../.."))
-    TVM_PATH = os.getenv('TVM_PATH', tvm_default)
-    return TVM_PATH
-
-class PkgConfig(object):
-    """Simple package config tool for VTA.
-
-    This is used to provide runtime specific configurations.
-
-    Parameters
-    ----------
-    cfg : dict
-        The config dictionary
-    """
-    cfg_keys = [
-        "TARGET",
-        "LOG_INP_WIDTH",
-        "LOG_WGT_WIDTH",
-        "LOG_ACC_WIDTH",
-        "LOG_BATCH",
-        "LOG_BLOCK",
-        "LOG_UOP_BUFF_SIZE",
-        "LOG_INP_BUFF_SIZE",
-        "LOG_WGT_BUFF_SIZE",
-        "LOG_ACC_BUFF_SIZE",
-    ]
-
-    def __init__(self, cfg):
-
-        # Derived parameters
-        cfg["LOG_BLOCK_IN"] = cfg["LOG_BLOCK"]
-        cfg["LOG_BLOCK_OUT"] = cfg["LOG_BLOCK"]
-        cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
-        cfg["LOG_OUT_BUFF_SIZE"] = (
-            cfg["LOG_ACC_BUFF_SIZE"] +
-            cfg["LOG_OUT_WIDTH"] -
-            cfg["LOG_ACC_WIDTH"])
-
-        # Update cfg now that we've extended it
-        self.__dict__.update(cfg)
-
-        # VTA_HW path and TVM_PATH
-        vta_hw_path = get_vta_hw_path()
-        tvm_path = get_tvm_path()
-
-        # Include path
-        self.include_path = [
-            "-I%s/include" % tvm_path,
-            "-I%s/include" % vta_hw_path,
-            "-I%s/3rdparty/dlpack/include" % tvm_path,
-            "-I%s/3rdparty/dmlc-core/include" % tvm_path
-        ]
-
-        # List of source files that can be used to build standalone library.
-        self.lib_source = []
-        self.lib_source += glob.glob("%s/src/*.cc" % vta_hw_path)
-        if self.TARGET in ["pynq", "ultra96"]:
-            # add pynq drivers for any board that uses pynq driver stack (see pynq.io)
-            self.lib_source += glob.glob("%s/src/pynq/*.cc" % vta_hw_path)
-        elif self.TARGET in ["de10nano"]:
-            self.lib_source += glob.glob("%s/src/de10nano/*.cc" % vta_hw_path)
-            self.include_path += [
-                "-I%s/src/de10nano" % vta_hw_path,
-                "-I%s/3rdparty" % tvm_path
-            ]
-
-        # Linker flags
-        if self.TARGET in ["pynq", "ultra96"]:
-            self.ldflags = [
-                "-L/usr/lib",
-                "-l:libcma.so"]
-        else:
-            self.ldflags = []
-
-        # Derive bitstream config string.
-        self.bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK"]),
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            (1 << cfg["LOG_ACC_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"])
-
-        # Derive FPGA parameters from target
-        #   - device:           part number
-        #   - family:           fpga family
-        #   - freq:             PLL frequency
-        #   - per:              clock period to achieve in HLS
-        #                       (how aggressively design is pipelined)
-        #   - axi_bus_width:    axi bus width used for DMA transactions
-        #                       (property of FPGA memory interface)
-        #   - axi_cache_bits:   ARCACHE/AWCACHE signals for the AXI bus
-        #                       (e.g. 1111 is write-back read and write allocate)
-        #   - axi_prot_bits:    ARPROT/AWPROT signals for the AXI bus
-        if self.TARGET == "de10nano":
-            self.fpga_device = "5CSEBA6U23I7"
-            self.fpga_family = "Cyclone\\ V"
-            # TODO: The following parameters have not been propagated into
-            # current Chisel-based implement of VTA hardware for DE10-Nano.
-            # A future change should be made to propagate these parameters,
-            # in order to avoid duplicated definition.
-            self.fpga_freq = 100
-            self.fpga_per = 2
-            self.fpga_log_axi_bus_width = 6
-            self.axi_prot_bits = '100'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0xFF220000"
-            self.load_base_addr = "0xFF221000"
-            self.compute_base_addr = "0xFF222000"
-            self.store_base_addr = "0xFF223000"
-        elif self.TARGET == "ultra96":
-            self.fpga_device = "xczu3eg-sbva484-1-e"
-            self.fpga_family = "zynq-ultrascale+"
-            self.fpga_freq = 333
-            self.fpga_per = 2
-            self.fpga_log_axi_bus_width = 7
-            self.axi_prot_bits = '010'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0xA0000000"
-            self.load_base_addr = "0xA0001000"
-            self.compute_base_addr = "0xA0002000"
-            self.store_base_addr = "0xA0003000"
-        else:
-            # By default, we use the pynq parameters
-            self.fpga_device = "xc7z020clg484-1"
-            self.fpga_family = "zynq-7000"
-            self.fpga_freq = 100
-            self.fpga_per = 7
-            self.fpga_log_axi_bus_width = 6
-            self.axi_prot_bits = '000'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0x43C00000"
-            self.load_base_addr = "0x43C01000"
-            self.compute_base_addr = "0x43C02000"
-            self.store_base_addr = "0x43C03000"
-        # Set coherence settings
-        coherent = True
-        if coherent:
-            self.axi_cache_bits = '1111'
-            self.coherent = True
-
-        # Define IP memory mapped registers offsets.
-        # In HLS 0x00-0x0C is reserved for block-level I/O protocol.
-        # Make sure to leave 8B between register offsets to maintain
-        # compatibility with 64bit systems.
-        self.fetch_insn_count_offset = 0x10
-        self.fetch_insn_addr_offset = self.fetch_insn_count_offset + 0x08
-        self.load_inp_addr_offset = 0x10
-        self.load_wgt_addr_offset = self.load_inp_addr_offset + 0x08
-        self.compute_done_wr_offet = 0x10
-        self.compute_done_rd_offet = self.compute_done_wr_offet + 0x08
-        self.compute_uop_addr_offset = self.compute_done_rd_offet + 0x08
-        self.compute_bias_addr_offset = self.compute_uop_addr_offset + 0x08
-        self.store_out_addr_offset = 0x10
-
-        # Derive SRAM parameters
-        # The goal here is to determine how many memory banks are needed,
-        # how deep and wide each bank needs to be. This is derived from
-        # the size of each memory element (result of data width, and tensor shape),
-        # and also how wide a memory can be as permitted by the FPGA tools.
-        #
-        # The mem axi ratio is a parameter used by HLS to resize memories
-        # so memory read/write ports are the same size as the design axi bus width.
-        #
-        # Max bus width allowed (property of FPGA vendor toolchain)
-        max_bus_width = 1024
-        # Bus width of a memory interface
-        mem_bus_width = 1 << self.fpga_log_axi_bus_width
-        # Input memory
-        inp_mem_bus_width = 1 << (cfg["LOG_INP_WIDTH"] + \
-                                  cfg["LOG_BATCH"] + \
-                                  cfg["LOG_BLOCK_IN"])
-        self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"]  # bytes
-        self.inp_mem_banks = (inp_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.inp_mem_width = min(inp_mem_bus_width, max_bus_width)
-        self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width
-        self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width
-        # Weight memory
-        wgt_mem_bus_width = 1 << (cfg["LOG_WGT_WIDTH"] + \
-                                  cfg["LOG_BLOCK_IN"] + \
-                                  cfg["LOG_BLOCK_OUT"])
-        self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"]  # bytes
-        self.wgt_mem_banks = (wgt_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width)
-        self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width
-        self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width
-        # Output memory
-        out_mem_bus_width = 1 << (cfg["LOG_OUT_WIDTH"] + \
-                                  cfg["LOG_BATCH"] + \
-                                  cfg["LOG_BLOCK_OUT"])
-        self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"]  # bytes
-        self.out_mem_banks = (out_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.out_mem_width = min(out_mem_bus_width, max_bus_width)
-        self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width
-        self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width
-
-        # Macro defs
-        self.macro_defs = []
-        self.cfg_dict = {}
-        for key in cfg:
-            self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key])))
-            self.cfg_dict[key] = cfg[key]
-        self.macro_defs.append("-DVTA_LOG_BUS_WIDTH=%s" % (self.fpga_log_axi_bus_width))
-        # Macros used by the VTA driver
-        self.macro_defs.append("-DVTA_IP_REG_MAP_RANGE=%s" % (self.ip_reg_map_range))
-        self.macro_defs.append("-DVTA_FETCH_ADDR=%s" % (self.fetch_base_addr))
-        self.macro_defs.append("-DVTA_LOAD_ADDR=%s" % (self.load_base_addr))
-        self.macro_defs.append("-DVTA_COMPUTE_ADDR=%s" % (self.compute_base_addr))
-        self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr))
-        # IP register offsets
-        self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \
-                               (self.fetch_insn_count_offset))
-        self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \
-                               (self.fetch_insn_addr_offset))
-        self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \
-                               (self.load_inp_addr_offset))
-        self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \
-                               (self.load_wgt_addr_offset))
-        self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \
-                               (self.compute_done_wr_offet))
-        self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \
-                               (self.compute_done_rd_offet))
-        self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \
-                               (self.compute_uop_addr_offset))
-        self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \
-                               (self.compute_bias_addr_offset))
-        self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \
-                               (self.store_out_addr_offset))
-        # Coherency
-        if coherent:
-            self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true")
-        else:
-            self.macro_defs.append("-DVTA_COHERENT_ACCESSES=false")
-
-    @property
-    def cflags(self):
-        return self.include_path + self.macro_defs
-
-    @property
-    def cfg_json(self):
-        return json.dumps(self.cfg_dict, indent=2)
-
-    def same_config(self, cfg):
-        """Compare if cfg is same as current config.
-
-        Parameters
-        ----------
-        cfg : the configuration
-            The configuration
-
-        Returns
-        -------
-        equal : bool
-            Whether the configuration is the same.
-        """
-        for k, v in self.cfg_dict.items():
-            if k not in cfg:
-                return False
-            if cfg[k] != v:
-                return False
-        return True
diff --git a/vta/vta-hw/config/pynq_sample.json b/vta/vta-hw/config/pynq_sample.json
deleted file mode 100644
index 7a2664105f76..000000000000
--- a/vta/vta-hw/config/pynq_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "pynq",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/tsim_sample.json b/vta/vta-hw/config/tsim_sample.json
deleted file mode 100644
index 71f77c0225a2..000000000000
--- a/vta/vta-hw/config/tsim_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "tsim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/ultra96_sample.json b/vta/vta-hw/config/ultra96_sample.json
deleted file mode 100644
index 35b5a7e322f0..000000000000
--- a/vta/vta-hw/config/ultra96_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "ultra96",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/vta_config.json b/vta/vta-hw/config/vta_config.json
deleted file mode 100644
index 0591bb486143..000000000000
--- a/vta/vta-hw/config/vta_config.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "sim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/vta_config.py b/vta/vta-hw/config/vta_config.py
deleted file mode 100644
index 9bb6d7b2f725..000000000000
--- a/vta/vta-hw/config/vta_config.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""VTA config tool"""
-import os
-import sys
-import json
-import argparse
-
-
-def pkg_config(cfg):
-    """Returns PkgConfig pkg config object."""
-    pkg_config_py = os.path.join(
-            os.path.dirname(os.path.abspath(os.path.expanduser(__file__))),
-            "pkg_config.py"
-    )
-    libpkg = {"__file__": pkg_config_py}
-    exec(compile(open(pkg_config_py, "rb").read(), pkg_config_py, "exec"), libpkg, libpkg)
-    PkgConfig = libpkg["PkgConfig"]
-    return PkgConfig(cfg)
-
-def main():
-    """Main funciton"""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--use-cfg", type=str, default="",
-                        help="path to the config json")
-    parser.add_argument("--cflags", action="store_true",
-                        help="print the cflags")
-    parser.add_argument("--defs", action="store_true",
-                        help="print the macro defs")
-    parser.add_argument("--sources", action="store_true",
-                        help="print the source file paths")
-    parser.add_argument("--update", action="store_true",
-                        help="Print out the json option.")
-    parser.add_argument("--ldflags", action="store_true",
-                        help="print the ldflags")
-    parser.add_argument("--cfg-json", action="store_true",
-                        help="print all the config json")
-    parser.add_argument("--save-cfg-json", type=str, default="",
-                        help="save config json to file")
-    parser.add_argument("--target", action="store_true",
-                        help="print the target")
-    parser.add_argument("--cfg-str", action="store_true",
-                        help="print the configuration string")
-    parser.add_argument("--get-inp-mem-banks", action="store_true",
-                        help="returns number of input memory banks")
-    parser.add_argument("--get-inp-mem-width", action="store_true",
-                        help="returns input memory read/write port width")
-    parser.add_argument("--get-inp-mem-depth", action="store_true",
-                        help="returns input memory depth")
-    parser.add_argument("--get-inp-mem-axi-ratio", action="store_true",
-                        help="returns ratio between input element width and axi width")
-    parser.add_argument("--get-wgt-mem-banks", action="store_true",
-                        help="returns number of weight memory banks")
-    parser.add_argument("--get-wgt-mem-width", action="store_true",
-                        help="returns weight memory read/write port width")
-    parser.add_argument("--get-wgt-mem-depth", action="store_true",
-                        help="returns weight memory depth")
-    parser.add_argument("--get-wgt-mem-axi-ratio", action="store_true",
-                        help="returns ratio between weight element width and axi width")
-    parser.add_argument("--get-out-mem-banks", action="store_true",
-                        help="returns number of output memory banks")
-    parser.add_argument("--get-out-mem-width", action="store_true",
-                        help="returns output memory read/write port width")
-    parser.add_argument("--get-out-mem-depth", action="store_true",
-                        help="returns output memory depth")
-    parser.add_argument("--get-out-mem-axi-ratio", action="store_true",
-                        help="returns ratio between output element width and axi width")
-    parser.add_argument("--get-axi-cache-bits", action="store_true",
-                        help="returns AXI system ARCACHE/AWCACHE hardcoded bit value")
-    parser.add_argument("--get-axi-prot-bits", action="store_true",
-                        help="returns AXI system ARPROT/AWPROT hardcoded bit value")
-    parser.add_argument("--get-ip-reg-map-range", action="store_true",
-                        help="returns ip register map address range")
-    parser.add_argument("--get-fetch-base-addr", action="store_true",
-                        help="returns fetch module base address")
-    parser.add_argument("--get-load-base-addr", action="store_true",
-                        help="returns load module base address")
-    parser.add_argument("--get-compute-base-addr", action="store_true",
-                        help="returns compute module base address")
-    parser.add_argument("--get-store-base-addr", action="store_true",
-                        help="returns store module base address")
-    parser.add_argument("--get-fpga-dev", action="store_true",
-                        help="returns FPGA device target")
-    parser.add_argument("--get-fpga-family", action="store_true",
-                        help="returns FPGA device family")
-    parser.add_argument("--get-fpga-freq", action="store_true",
-                        help="returns FPGA frequency")
-    parser.add_argument("--get-fpga-per", action="store_true",
-                        help="returns HLS target clock period")
-    args = parser.parse_args()
-
-    if len(sys.argv) == 1:
-        parser.print_help()
-        return
-
-    # Path to vta config
-    curr_path = os.path.dirname(
-        os.path.abspath(os.path.expanduser(__file__)))
-
-    path_list = [
-        "vta_config.json", os.path.join(curr_path, "vta_config.json")
-    ]
-
-    if args.use_cfg:
-        path_list = [args.use_cfg]
-
-    ok_path_list = [p for p in path_list if os.path.exists(p)]
-    if not ok_path_list:
-        raise RuntimeError("Cannot find config in %s" % str(path_list))
-
-    cfg = json.load(open(ok_path_list[0]))
-    pkg = pkg_config(cfg)
-
-    if args.target:
-        print(pkg.TARGET)
-
-    if args.defs:
-        print(" ".join(pkg.macro_defs))
-
-    if args.sources:
-        print(" ".join(pkg.lib_source))
-
-    if args.cflags:
-        cflags_str = " ".join(pkg.cflags)
-        if pkg.TARGET == "pynq":
-            cflags_str += " -DVTA_TARGET_PYNQ"
-        elif pkg.TARGET == "de10nano":
-            cflags_str += " -DVTA_TARGET_DE10_NANO"
-        elif pkg.TARGET == "ultra96":
-            cflags_str += " -DVTA_TARGET_ULTRA96"
-        print(cflags_str)
-
-    if args.ldflags:
-        print(" ".join(pkg.ldflags))
-
-    if args.cfg_json:
-        print(pkg.cfg_json)
-
-    if args.save_cfg_json:
-        with open(args.save_cfg_json, "w") as fo:
-            fo.write(pkg.cfg_json)
-
-    if args.cfg_str:
-        print(pkg.TARGET + "_" + pkg.bitstream)
-
-    if args.get_inp_mem_banks:
-        print(pkg.inp_mem_banks)
-
-    if args.get_inp_mem_width:
-        print(pkg.inp_mem_width)
-
-    if args.get_inp_mem_depth:
-        print(pkg.inp_mem_depth)
-
-    if args.get_inp_mem_axi_ratio:
-        print(pkg.inp_mem_axi_ratio)
-
-    if args.get_wgt_mem_banks:
-        print(pkg.wgt_mem_banks)
-
-    if args.get_wgt_mem_width:
-        print(pkg.wgt_mem_width)
-
-    if args.get_wgt_mem_depth:
-        print(pkg.wgt_mem_depth)
-
-    if args.get_wgt_mem_axi_ratio:
-        print(pkg.wgt_mem_axi_ratio)
-
-    if args.get_out_mem_banks:
-        print(pkg.out_mem_banks)
-
-    if args.get_out_mem_width:
-        print(pkg.out_mem_width)
-
-    if args.get_out_mem_depth:
-        print(pkg.out_mem_depth)
-
-    if args.get_out_mem_axi_ratio:
-        print(pkg.out_mem_axi_ratio)
-
-    if args.get_axi_cache_bits:
-        print(pkg.axi_cache_bits)
-
-    if args.get_axi_prot_bits:
-        print(pkg.axi_prot_bits)
-
-    if args.get_ip_reg_map_range:
-        print(pkg.ip_reg_map_range)
-
-    if args.get_fetch_base_addr:
-        print(pkg.fetch_base_addr)
-
-    if args.get_load_base_addr:
-        print(pkg.load_base_addr)
-
-    if args.get_compute_base_addr:
-        print(pkg.compute_base_addr)
-
-    if args.get_store_base_addr:
-        print(pkg.store_base_addr)
-
-    if args.get_fpga_dev:
-        print(pkg.fpga_device)
-
-    if args.get_fpga_family:
-        print(pkg.fpga_family)
-
-    if args.get_fpga_freq:
-        print(pkg.fpga_freq)
-
-    if args.get_fpga_per:
-        print(pkg.fpga_per)
-
-if __name__ == "__main__":
-    main()
diff --git a/vta/vta-hw/hardware/chisel/.gitignore b/vta/vta-hw/hardware/chisel/.gitignore
deleted file mode 100644
index f65a6ba0485f..000000000000
--- a/vta/vta-hw/hardware/chisel/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-test_run_dir
diff --git a/vta/vta-hw/hardware/chisel/Makefile b/vta/vta-hw/hardware/chisel/Makefile
deleted file mode 100644
index 049b4d468d2f..000000000000
--- a/vta/vta-hw/hardware/chisel/Makefile
+++ /dev/null
@@ -1,205 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed on a different location
-ifeq (, $(VERILATOR_INC_DIR))
-	ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-		ifeq (, $(wildcard /usr/share/verilator/include/*))
-			$(error "Verilator include directory is not set properly")
-		else
-			VERILATOR_INC_DIR := /usr/share/verilator/include
-		endif
-	else
-			VERILATOR_INC_DIR := /usr/local/share/verilator/include
-	endif
-endif
-
-CONFIG = DefaultDe10Config
-TOP = VTA
-TOP_TEST = Test
-BUILD_NAME = build
-# Set USE_TRACE = 1 to generate a trace during simulation.
-USE_TRACE = 0
-# With USE_TRACE = 1, default trace format is VCD.
-# Set USE_TRACE_FST = 1 to use the FST format.
-# Note that although FST is around two orders of magnitude smaller than VCD
-# it is also currently much slower to produce (verilator limitation). But if
-# you are low on disk space it may be your only option.
-USE_TRACE_FST = 0
-# With USE_TRACE = 1, USE_TRACE_DETAILED = 1 will generate traces that also
-# include non-interface internal signal names starting with an underscore.
-# This will significantly increase the trace size and should only be used
-# on a per need basis for difficult debug problems.
-USE_TRACE_DETAILED = 0
-USE_THREADS = 0
-VTA_LIBNAME = libvta_hw
-UNITTEST_NAME = all
-CXX = g++
-# A debug build with DEBUG = 1 is useful to trace the simulation with a
-# debugger.
-DEBUG = 0
-# With DEBUG = 1, SANITIZE = 1 turns on address sanitizing to verify that
-# the verilator build is sane. To be used if you know what you are doing.
-SANITIZE = 0
-
-CXX_MAJOR := $(shell $(CXX) -dumpversion | sed 's/\..*//')
-CXX_HAS_ALIGN_NEW := $(shell [ $(CXX_MAJOR) -ge 7 ] && echo true)
-
-config_test = $(TOP_TEST)$(CONFIG)
-
-
-ifndef TVM_PATH
-   TVM_PATH := $(abspath ../../../../)
-endif
-
-ifndef VTA_HW_PATH
-   VTA_HW_PATH := $(abspath ../../)
-endif
-
-verilator_build_dir = $(VTA_HW_PATH)/$(BUILD_NAME)/verilator
-chisel_build_dir = $(VTA_HW_PATH)/$(BUILD_NAME)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP_TEST}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-ifeq ($(DEBUG), 0)
-	cxx_flags = -O2 -Wall -fvisibility=hidden
-else
-	cxx_flags = -O0 -g -Wall
-endif
-
-cxx_flags += -std=c++11 -Wno-maybe-uninitialized
-ifeq ($(CXX_HAS_ALIGN_NEW),true)
-	cxx_flags += -faligned-new
-endif
-cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP_TEST).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(VTA_HW_PATH)/include
-cxx_flags += -I$(TVM_PATH)/include
-cxx_flags += -I$(TVM_PATH)/3rdparty/dlpack/include
-
-ld_flags = -fPIC -shared
-
-ifeq ($(SANITIZE), 1)
-	ifeq ($(DEBUG), 1)
-		cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-		ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-	endif
-endif
-
-cxx_objs = $(verilator_build_dir)/verilated.o $(verilator_build_dir)/verilated_dpi.o $(verilator_build_dir)/tsim_device.o
-
-ifneq ($(USE_TRACE), 0)
-	cxx_flags += -DVM_TRACE=1
-	ifeq ($(USE_TRACE_FST), 1)
-		cxx_flags += -DVM_TRACE_FST
-		verilator_opt += --trace-fst
-	else
-		verilator_opt += --trace
-	endif
-	ifeq ($(USE_TRACE_DETAILED), 1)
-		verilator_opt += --trace-underscore --trace-structs
-	endif
-	ifeq ($(USE_TRACE_FST), 1)
-		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
-		cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
-	else
-		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
-		cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
-	endif
-else
-	cxx_flags += -DVM_TRACE=0
-endif
-
-ifneq ($(USE_THREADS), 0)
-	verilator_opt += --threads $(USE_THREADS)
-	cxx_flags += -DVL_THREADED
-	cxx_objs += $(verilator_build_dir)/verilated_threads.o
-endif
-
-VPATH = $(VERILATOR_INC_DIR):$(verilator_build_dir):$(VTA_HW_PATH)/hardware/dpi
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-	lib_path = $(VTA_HW_PATH)/$(BUILD_NAME)/$(VTA_LIBNAME).dylib
-	cxx_flags += -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-else
-	lib_path = $(VTA_HW_PATH)/$(BUILD_NAME)/$(VTA_LIBNAME).so
-endif
-
-default: lint lib
-
-lint:
-	sbt scalastyle
-
-lib: $(lib_path)
-
-$(verilator_build_dir)/%.o: %.cpp
-	$(CXX) -fPIC $(cxx_flags) -c $^ -o $@
-
-$(verilator_build_dir)/tsim_device.o: tsim_device.cc
-	$(CXX) -fPIC $(cxx_flags) -c $^ -o $@
-
-$(lib_path): $(verilator_build_dir)/V$(TOP_TEST).cpp $(cxx_objs)
-	for f in $(shell find $(verilator_build_dir)/*.cpp); do \
-		$(CXX) -fPIC $(cxx_flags) -c $${f} -o $${f}.o ; \
-	done
-	$(CXX) $(ld_flags) $(cxx_flags) $(cxx_objs) $(patsubst %.cpp,%.cpp.o,$(shell find $(verilator_build_dir)/*.cpp)) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP_TEST).cpp
-$(verilator_build_dir)/V$(TOP_TEST).cpp: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).$(CONFIG).v
-$(chisel_build_dir)/$(TOP).$(CONFIG).v:
-	sbt 'runMain vta.$(CONFIG) --target-dir $(chisel_build_dir) --top-name $(TOP).$(CONFIG)'
-
-verilog_test: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
-$(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v:
-	sbt 'runMain vta.$(config_test) --target-dir $(chisel_build_dir) --top-name $(TOP_TEST).$(CONFIG)'
-
-unittest:
-	sbt 'test:runMain unittest.Launcher $(UNITTEST_NAME)'
-
-clean:
-	-rm -rf target project/target project/project test_run_dir
-
-cleanall:
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/chisel
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/libvta_hw.so
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/libvta_hw.dylib
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/verilator
diff --git a/vta/vta-hw/hardware/chisel/README.md b/vta/vta-hw/hardware/chisel/README.md
deleted file mode 100644
index 40c43220c5e1..000000000000
--- a/vta/vta-hw/hardware/chisel/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA in Chisel
-===================================================
-For contributors who wants to test a chisel module:
-
- - You can add your test files in  `src/test/scala/unitttest`
- - Add your test name and tests to the `test` object in `src/test/scala/unitttest/Launcher.scala`
- - Check out the provided sample test `mvm` which tests the MatrixVectorComputation module
-    in `src/main/scala/core/TensorGemm.scala`
-
-- Running unit tests: `make test test_name=your_own test_name`
-
-
-
diff --git a/vta/vta-hw/hardware/chisel/build.sbt b/vta/vta-hw/hardware/chisel/build.sbt
deleted file mode 100644
index 7efd59db3486..000000000000
--- a/vta/vta-hw/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "vta"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-val defaultVersions = Map(
-  "chisel3" -> "3.1.7",
-  "chisel-iotesters" -> "1.2.4"
-  )
-
-libraryDependencies ++= Seq("chisel3","chisel-iotesters").map {
-  dep: String => "edu.berkeley.cs" %% dep % sys.props.getOrElse(dep + "Version", defaultVersions(dep)) }
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/hardware/chisel/project/build.properties b/vta/vta-hw/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998eb3eac..000000000000
--- a/vta/vta-hw/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/hardware/chisel/project/plugins.sbt b/vta/vta-hw/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 19ae5c9d49b9..000000000000
--- a/vta/vta-hw/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/vta-hw/hardware/chisel/scalastyle-config.xml b/vta/vta-hw/hardware/chisel/scalastyle-config.xml
deleted file mode 100644
index ae7c8e6b588a..000000000000
--- a/vta/vta-hw/hardware/chisel/scalastyle-config.xml
+++ /dev/null
@@ -1,128 +0,0 @@
-<scalastyle>
- <name>Scalastyle standard configuration</name>
- <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxFileLength"><![CDATA[800]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
-  <parameters>
-   <parameter name="header"><![CDATA[/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxLineLength"><![CDATA[120]]></parameter>
-   <parameter name="tabSize"><![CDATA[2]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true">
-  <parameters>
-   <parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
-  <parameters>
-   <parameter name="maxParameters"><![CDATA[8]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
-  <parameters>
-   <parameter name="ignore"><![CDATA[-1,0,1,2,3,4,8,16,32,64,128]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[println]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="true">
-  <parameters>
-   <parameter name="maxTypes"><![CDATA[30]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="true">
-  <parameters>
-   <parameter name="maximum"><![CDATA[10]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="false">
-  <parameters>
-   <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
-   <parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxLength"><![CDATA[50]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
-  <parameters>
-   <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true">
-  <parameters>
-   <parameter name="maxMethods"><![CDATA[30]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.file.IndentationChecker" enabled="true">
-   <parameters>
-     <parameter name="tabSize">2</parameter>
-     <parameter name="methodParamIndentSize">2</parameter>
-     <parameter name="classParamIndentSize">4</parameter>
-   </parameters>
- </check>
-</scalastyle>
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
deleted file mode 100644
index 3441e3e31891..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTAHostDPI #
-( parameter ADDR_BITS = 8,
-  parameter DATA_BITS = 32
-)
-(
-  input                        clock,
-  input                        reset,
-  output logic                 dpi_req_valid,
-  output logic                 dpi_req_opcode,
-  output logic [ADDR_BITS-1:0] dpi_req_addr,
-  output logic [DATA_BITS-1:0] dpi_req_value,
-  input                        dpi_req_deq,
-  input                        dpi_resp_valid,
-  input        [DATA_BITS-1:0] dpi_resp_bits
-);
-
-  import "DPI-C" function void VTAHostDPI
-  (
-    output byte unsigned req_valid,
-    output byte unsigned req_opcode,
-    output byte unsigned req_addr,
-    output int  unsigned req_value,
-    input  byte unsigned req_deq,
-    input  byte unsigned resp_valid,
-    input  int  unsigned resp_value
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-  typedef logic [31:0] dpi32_t;
-
-  dpi1_t  __reset;
-  dpi8_t  __req_valid;
-  dpi8_t  __req_opcode;
-  dpi8_t  __req_addr;
-  dpi32_t __req_value;
-  dpi8_t  __req_deq;
-  dpi8_t  __resp_valid;
-  dpi32_t __resp_bits;
-
-  // reset
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // delaying outputs by one-cycle
-  // since verilator does not support delays
-  always_ff @(posedge clock) begin
-    dpi_req_valid  <= dpi1_t ' (__req_valid);
-    dpi_req_opcode <= dpi1_t ' (__req_opcode);
-    dpi_req_addr   <= __req_addr;
-    dpi_req_value  <= __req_value;
-  end
-
-  assign __req_deq    = dpi8_t ' (dpi_req_deq);
-  assign __resp_valid = dpi8_t ' (dpi_resp_valid);
-  assign __resp_bits  = dpi_resp_bits;
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __req_valid = 0;
-      __req_opcode = 0;
-      __req_addr = 0;
-      __req_value = 0;
-    end
-    else begin
-      VTAHostDPI(
-        __req_valid,
-        __req_opcode,
-        __req_addr,
-        __req_value,
-        __req_deq,
-        __resp_valid,
-        __resp_bits);
-    end
-  end
-
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
deleted file mode 100644
index e0ed949bf8cf..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTAMemDPI #
-( parameter LEN_BITS = 8,
-  parameter ADDR_BITS = 64,
-  parameter DATA_BITS = 64
-)
-(
-  input                        clock,
-  input                        reset,
-  input                        dpi_req_valid,
-  input                        dpi_req_opcode,
-  input         [LEN_BITS-1:0] dpi_req_len,
-  input        [ADDR_BITS-1:0] dpi_req_addr,
-  input                        dpi_wr_valid,
-  input        [DATA_BITS-1:0] dpi_wr_bits,
-  output logic                 dpi_rd_valid,
-  output logic [DATA_BITS-1:0] dpi_rd_bits,
-  input                        dpi_rd_ready
-);
-
-  import "DPI-C" function void VTAMemDPI
-  (
-    input  byte     unsigned req_valid,
-    input  byte     unsigned req_opcode,
-    input  byte     unsigned req_len,
-    input  longint  unsigned req_addr,
-    input  byte     unsigned wr_valid,
-    input  longint  unsigned wr_value,
-    output byte     unsigned rd_valid,
-    output longint  unsigned rd_value,
-    input  byte     unsigned rd_ready
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-  typedef logic [31:0] dpi32_t;
-  typedef logic [63:0] dpi64_t;
-
-  dpi1_t  __reset;
-  dpi8_t  __req_valid;
-  dpi8_t  __req_opcode;
-  dpi8_t  __req_len;
-  dpi64_t __req_addr;
-  dpi8_t  __wr_valid;
-  dpi64_t __wr_value;
-  dpi8_t  __rd_valid;
-  dpi64_t __rd_value;
-  dpi8_t  __rd_ready;
-
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // delaying outputs by one-cycle
-  // since verilator does not support delays
-  always_ff @(posedge clock) begin
-    dpi_rd_valid <= dpi1_t ' (__rd_valid);
-    dpi_rd_bits  <= __rd_value;
-  end
-
-  assign __req_valid  = dpi8_t ' (dpi_req_valid);
-  assign __req_opcode = dpi8_t ' (dpi_req_opcode);
-  assign __req_len    = dpi_req_len;
-  assign __req_addr   = dpi_req_addr;
-  assign __wr_valid   = dpi8_t ' (dpi_wr_valid);
-  assign __wr_value   = dpi_wr_bits;
-  assign __rd_ready   = dpi8_t ' (dpi_rd_ready);
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __rd_valid = 0;
-      __rd_value = 0;
-    end
-    else begin
-      VTAMemDPI(
-        __req_valid,
-        __req_opcode,
-        __req_len,
-        __req_addr,
-        __wr_valid,
-        __wr_value,
-        __rd_valid,
-        __rd_value,
-        __rd_ready);
-    end
-  end
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
deleted file mode 100644
index fc0d4c815d77..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTASimDPI
-(
-  input                        clock,
-  input                        reset,
-  output logic                 dpi_wait
-);
-
-  import "DPI-C" function void VTASimDPI
-  (
-    output byte unsigned sim_wait,
-    output byte unsigned sim_exit
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-
-  dpi1_t __reset;
-  dpi8_t __wait;
-  dpi8_t __exit;
-
-  // reset
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __wait = 0;
-      __exit = 0;
-    end
-    else begin
-      VTASimDPI(
-        __wait,
-	__exit);
-    end
-  end
-
-  logic wait_reg;
-
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      wait_reg <= 1'b0;
-    end else if (__wait == 1) begin
-      wait_reg <= 1'b1;
-    end else begin
-      wait_reg <= 1'b0;
-    end
-  end
-
-  assign dpi_wait = wait_reg;
-
-  always_ff @(posedge clock) begin
-    if (__exit == 1) begin
-      $finish;
-    end
-  end
-
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala
deleted file mode 100644
index a1e7fadd96cf..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Compute.
- *
- * The compute unit is in charge of the following:
- * - Loading micro-ops from memory (loadUop module)
- * - Loading biases (acc) from memory (tensorAcc module)
- * - Compute ALU instructions (tensorAlu module)
- * - Compute GEMM instructions (tensorGemm module)
- */
-class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Vec(2, Input(Bool()))
-    val o_post = Vec(2, Output(Bool()))
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val uop_baddr = Input(UInt(mp.addrBits.W))
-    val acc_baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = Vec(2, new VMEReadMaster)
-    val inp = new TensorMaster(tensorType = "inp")
-    val wgt = new TensorMaster(tensorType = "wgt")
-    val out = new TensorMaster(tensorType = "out")
-    val finish = Output(Bool())
-    val acc_wr_event = Output(Bool())
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Seq.tabulate(2)(_ =>
-    Module(new Semaphore(counterBits = 8, counterInitValue = 0)))
-
-  val loadUop = Module(new LoadUop)
-  val tensorAcc = Module(new TensorLoad(tensorType = "acc"))
-  val tensorGemm = Module(new TensorGemm)
-  val tensorAlu = Module(new TensorAlu)
-
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  // decode
-  val dec = Module(new ComputeDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val inst_type =
-    Cat(dec.io.isFinish,
-      dec.io.isAlu,
-      dec.io.isGemm,
-      dec.io.isLoadAcc,
-      dec.io.isLoadUop).asUInt
-
-  val sprev = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s(0).io.sready, true.B)
-  val snext = inst_q.io.deq.valid & Mux(dec.io.pop_next, s(1).io.sready, true.B)
-  val start = snext & sprev
-  val done =
-    MuxLookup(
-      inst_type,
-      false.B, // default
-      Array(
-        "h_01".U -> loadUop.io.done,
-        "h_02".U -> tensorAcc.io.done,
-        "h_04".U -> tensorGemm.io.done,
-        "h_08".U -> tensorAlu.io.done,
-        "h_10".U -> true.B // Finish
-      )
-    )
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(inst_type.orR) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // uop
-  loadUop.io.start := state === sIdle & start & dec.io.isLoadUop
-  loadUop.io.inst := inst_q.io.deq.bits
-  loadUop.io.baddr := io.uop_baddr
-  io.vme_rd(0) <> loadUop.io.vme_rd
-  loadUop.io.uop.idx <> Mux(dec.io.isGemm, tensorGemm.io.uop.idx, tensorAlu.io.uop.idx)
-
-  // acc
-  tensorAcc.io.start := state === sIdle & start & dec.io.isLoadAcc
-  tensorAcc.io.inst := inst_q.io.deq.bits
-  tensorAcc.io.baddr := io.acc_baddr
-  tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
-  tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
-  io.vme_rd(1) <> tensorAcc.io.vme_rd
-  io.acc_wr_event := tensorAcc.io.tensor.wr.valid
-
-  // gemm
-  tensorGemm.io.start := state === sIdle & start & dec.io.isGemm
-  tensorGemm.io.inst := inst_q.io.deq.bits
-  tensorGemm.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isGemm
-  tensorGemm.io.uop.data.bits <> loadUop.io.uop.data.bits
-  tensorGemm.io.inp <> io.inp
-  tensorGemm.io.wgt <> io.wgt
-  tensorGemm.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isGemm
-  tensorGemm.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
-  tensorGemm.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isGemm
-  tensorGemm.io.out.rd.data.bits <> io.out.rd.data.bits
-
-  // alu
-  tensorAlu.io.start := state === sIdle & start & dec.io.isAlu
-  tensorAlu.io.inst := inst_q.io.deq.bits
-  tensorAlu.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isAlu
-  tensorAlu.io.uop.data.bits <> loadUop.io.uop.data.bits
-  tensorAlu.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isAlu
-  tensorAlu.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
-  tensorAlu.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isAlu
-  tensorAlu.io.out.rd.data.bits <> io.out.rd.data.bits
-
-  // out
-  io.out.rd.idx <> Mux(dec.io.isGemm,
-    tensorGemm.io.out.rd.idx,
-    tensorAlu.io.out.rd.idx)
-  io.out.wr <> Mux(dec.io.isGemm, tensorGemm.io.out.wr, tensorAlu.io.out.wr)
-
-  // semaphore
-  s(0).io.spost := io.i_post(0)
-  s(1).io.spost := io.i_post(1)
-  s(0).io.swait := dec.io.pop_prev & (state === sIdle & start)
-  s(1).io.swait := dec.io.pop_next & (state === sIdle & start)
-  io.o_post(0) := dec.io.push_prev & ((state === sExe & done) | (state === sSync))
-  io.o_post(1) := dec.io.push_next & ((state === sExe & done) | (state === sSync))
-
-  // finish
-  io.finish := state === sExe & done & dec.io.isFinish
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Compute] start sync\n")
-      }.elsewhen(dec.io.isLoadUop) {
-        printf("[Compute] start load uop\n")
-      }.elsewhen(dec.io.isLoadAcc) {
-        printf("[Compute] start load acc\n")
-      }.elsewhen(dec.io.isGemm) {
-        printf("[Compute] start gemm\n")
-      }.elsewhen(dec.io.isAlu) {
-        printf("[Compute] start alu\n")
-      }.elsewhen(dec.io.isFinish) {
-        printf("[Compute] start finish\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Compute] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        when(dec.io.isLoadUop) {
-          printf("[Compute] done load uop\n")
-        }.elsewhen(dec.io.isLoadAcc) {
-          printf("[Compute] done load acc\n")
-        }.elsewhen(dec.io.isGemm) {
-          printf("[Compute] done gemm\n")
-        }.elsewhen(dec.io.isAlu) {
-          printf("[Compute] done alu\n")
-        }.elsewhen(dec.io.isFinish) {
-          printf("[Compute] done finish\n")
-        }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala
deleted file mode 100644
index 4ab7d8503a0a..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import vta.util.config._
-
-/** CoreConfig.
- *
- * This is one supported configuration for VTA. This file will
- * be eventually filled out with class configurations that can be
- * mixed/matched with Shell configurations for different backends.
- */
-class CoreConfig extends Config((site, here, up) => {
-  case CoreKey =>
-    CoreParams(
-      batch = 1,
-      blockOut = 16,
-      blockIn = 16,
-      inpBits = 8,
-      wgtBits = 8,
-      uopBits = 32,
-      accBits = 32,
-      outBits = 8,
-      uopMemDepth = 2048,
-      inpMemDepth = 2048,
-      wgtMemDepth = 1024,
-      accMemDepth = 2048,
-      outMemDepth = 2048,
-      instQueueEntries = 512
-    )
-})
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala
deleted file mode 100644
index e2ac51a55d48..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import vta.util.config._
-import vta.shell._
-
-/** Core parameters */
-case class CoreParams(
-    batch: Int = 1,
-    blockOut: Int = 16,
-    blockIn: Int = 16,
-    inpBits: Int = 8,
-    wgtBits: Int = 8,
-    uopBits: Int = 32,
-    accBits: Int = 32,
-    outBits: Int = 8,
-    uopMemDepth: Int = 512,
-    inpMemDepth: Int = 512,
-    wgtMemDepth: Int = 512,
-    accMemDepth: Int = 512,
-    outMemDepth: Int = 512,
-    instQueueEntries: Int = 32
-) {
-  require(uopBits % 8 == 0,
-    s"\n\n[VTA] [CoreParams] uopBits must be byte aligned\n\n")
-}
-
-case object CoreKey extends Field[CoreParams]
-
-/** Core.
- *
- * The core defines the current VTA architecture by connecting memory and
- * compute modules together such as load/store and compute. Most of the
- * connections in the core are bulk (<>), and we should try to keep it this
- * way, because it is easier to understand what is going on.
- *
- * Also, the core must be instantiated by a shell using the
- * VTA Control Register (VCR) and the VTA Memory Engine (VME) interfaces.
- * More info about these interfaces and modules can be found in the shell
- * directory.
- */
-class Core(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val vcr = new VCRClient
-    val vme = new VMEMaster
-  })
-  val fetch = Module(new Fetch)
-  val load = Module(new Load)
-  val compute = Module(new Compute)
-  val store = Module(new Store)
-  val ecounters = Module(new EventCounters)
-
-  // Read(rd) and write(wr) from/to memory (i.e. DRAM)
-  io.vme.rd(0) <> fetch.io.vme_rd
-  io.vme.rd(1) <> compute.io.vme_rd(0)
-  io.vme.rd(2) <> load.io.vme_rd(0)
-  io.vme.rd(3) <> load.io.vme_rd(1)
-  io.vme.rd(4) <> compute.io.vme_rd(1)
-  io.vme.wr(0) <> store.io.vme_wr
-
-  // Fetch instructions (tasks) from memory (DRAM) into queues (SRAMs)
-  fetch.io.launch := io.vcr.launch
-  fetch.io.ins_baddr := io.vcr.ptrs(0)
-  fetch.io.ins_count := io.vcr.vals(0)
-
-  // Load inputs and weights from memory (DRAM) into scratchpads (SRAMs)
-  load.io.i_post := compute.io.o_post(0)
-  load.io.inst <> fetch.io.inst.ld
-  load.io.inp_baddr := io.vcr.ptrs(2)
-  load.io.wgt_baddr := io.vcr.ptrs(3)
-
-  // The compute module performs the following:
-  // - Load micro-ops (uops) and accumulations (acc)
-  // - Compute dense and ALU instructions (tasks)
-  compute.io.i_post(0) := load.io.o_post
-  compute.io.i_post(1) := store.io.o_post
-  compute.io.inst <> fetch.io.inst.co
-  compute.io.uop_baddr := io.vcr.ptrs(1)
-  compute.io.acc_baddr := io.vcr.ptrs(4)
-  compute.io.inp <> load.io.inp
-  compute.io.wgt <> load.io.wgt
-
-  // The store module performs the following:
-  // - Writes results from compute into scratchpads (SRAMs)
-  // - Store results from scratchpads (SRAMs) to memory (DRAM)
-  store.io.i_post := compute.io.o_post(1)
-  store.io.inst <> fetch.io.inst.st
-  store.io.out_baddr := io.vcr.ptrs(5)
-  store.io.out <> compute.io.out
-
-  // Event counters
-  ecounters.io.launch := io.vcr.launch
-  ecounters.io.finish := compute.io.finish
-  io.vcr.ecnt <> ecounters.io.ecnt
-  io.vcr.ucnt <> ecounters.io.ucnt
-  ecounters.io.acc_wr_event := compute.io.acc_wr_event
-
-  // Finish instruction is executed and asserts the VCR finish flag
-  val finish = RegNext(compute.io.finish)
-  io.vcr.finish := finish
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala
deleted file mode 100644
index 37f6ab40584c..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-
-import ISA._
-
-/** MemDecode.
- *
- * Decode memory instructions with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields. These are the instructions
- * decoded with this bundle:
- *   - LUOP
- *   - LWGT
- *   - LINP
- *   - LACC
- *   - SOUT
- */
-class MemDecode extends Bundle {
-  val xpad_1 = UInt(M_PAD_BITS.W)
-  val xpad_0 = UInt(M_PAD_BITS.W)
-  val ypad_1 = UInt(M_PAD_BITS.W)
-  val ypad_0 = UInt(M_PAD_BITS.W)
-  val xstride = UInt(M_STRIDE_BITS.W)
-  val xsize = UInt(M_SIZE_BITS.W)
-  val ysize = UInt(M_SIZE_BITS.W)
-  val empty_0 = UInt(7.W) // derive this
-  val dram_offset = UInt(M_DRAM_OFFSET_BITS.W)
-  val sram_offset = UInt(M_SRAM_OFFSET_BITS.W)
-  val id = UInt(M_ID_BITS.W)
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** GemmDecode.
- *
- * Decode GEMM instruction with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields.
- */
-class GemmDecode extends Bundle {
-  val wgt_1 = UInt(C_WIDX_BITS.W)
-  val wgt_0 = UInt(C_WIDX_BITS.W)
-  val inp_1 = UInt(C_IIDX_BITS.W)
-  val inp_0 = UInt(C_IIDX_BITS.W)
-  val acc_1 = UInt(C_AIDX_BITS.W)
-  val acc_0 = UInt(C_AIDX_BITS.W)
-  val empty_0 = Bool()
-  val lp_1 = UInt(C_ITER_BITS.W)
-  val lp_0 = UInt(C_ITER_BITS.W)
-  val uop_end = UInt(C_UOP_END_BITS.W)
-  val uop_begin = UInt(C_UOP_BGN_BITS.W)
-  val reset = Bool()
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** AluDecode.
- *
- * Decode ALU instructions with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields. These are the instructions
- * decoded with this bundle:
- *   - VMIN
- *   - VMAX
- *   - VADD
- *   - VSHX
- */
-class AluDecode extends Bundle {
-  val empty_1 = Bool()
-  val alu_imm = UInt(C_ALU_IMM_BITS.W)
-  val alu_use_imm = Bool()
-  val alu_op = UInt(C_ALU_DEC_BITS.W)
-  val src_1 = UInt(C_IIDX_BITS.W)
-  val src_0 = UInt(C_IIDX_BITS.W)
-  val dst_1 = UInt(C_AIDX_BITS.W)
-  val dst_0 = UInt(C_AIDX_BITS.W)
-  val empty_0 = Bool()
-  val lp_1 = UInt(C_ITER_BITS.W)
-  val lp_0 = UInt(C_ITER_BITS.W)
-  val uop_end = UInt(C_UOP_END_BITS.W)
-  val uop_begin = UInt(C_UOP_BGN_BITS.W)
-  val reset = Bool()
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** UopDecode.
- *
- * Decode micro-ops (uops).
- */
-class UopDecode extends Bundle {
-  val u2 = UInt(10.W)
-  val u1 = UInt(11.W)
-  val u0 = UInt(11.W)
-}
-
-/** FetchDecode.
- *
- * Partial decoding for dispatching instructions to Load, Compute, and Store.
- */
-class FetchDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val isLoad = Output(Bool())
-    val isCompute = Output(Bool())
-    val isStore = Output(Bool())
-  })
-  val csignals =
-    ListLookup(
-      io.inst,
-      List(N, OP_X),
-      Array(
-        LUOP -> List(Y, OP_G),
-        LWGT -> List(Y, OP_L),
-        LINP -> List(Y, OP_L),
-        LACC -> List(Y, OP_G),
-        SOUT -> List(Y, OP_S),
-        GEMM -> List(Y, OP_G),
-        FNSH -> List(Y, OP_G),
-        VMIN -> List(Y, OP_G),
-        VMAX -> List(Y, OP_G),
-        VADD -> List(Y, OP_G),
-        VSHX -> List(Y, OP_G)
-      )
-    )
-
-  val (cs_val_inst: Bool) :: cs_op_type :: Nil = csignals
-
-  io.isLoad := cs_val_inst & cs_op_type === OP_L
-  io.isCompute := cs_val_inst & cs_op_type === OP_G
-  io.isStore := cs_val_inst & cs_op_type === OP_S
-}
-
-/** LoadDecode.
- *
- * Decode dependencies, type and sync for Load module.
- */
-class LoadDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_next = Output(Bool())
-    val pop_next = Output(Bool())
-    val isInput = Output(Bool())
-    val isWeight = Output(Bool())
-    val isSync = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_next := dec.push_next
-  io.pop_next := dec.pop_next
-  io.isInput := io.inst === LINP & dec.xsize =/= 0.U
-  io.isWeight := io.inst === LWGT & dec.xsize =/= 0.U
-  io.isSync := (io.inst === LINP | io.inst === LWGT) & dec.xsize === 0.U
-}
-
-/** ComputeDecode.
- *
- * Decode dependencies, type and sync for Compute module.
- */
-class ComputeDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_next = Output(Bool())
-    val push_prev = Output(Bool())
-    val pop_next = Output(Bool())
-    val pop_prev = Output(Bool())
-    val isLoadAcc = Output(Bool())
-    val isLoadUop = Output(Bool())
-    val isSync = Output(Bool())
-    val isAlu = Output(Bool())
-    val isGemm = Output(Bool())
-    val isFinish = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_next := dec.push_next
-  io.push_prev := dec.push_prev
-  io.pop_next := dec.pop_next
-  io.pop_prev := dec.pop_prev
-  io.isLoadAcc := io.inst === LACC & dec.xsize =/= 0.U
-  io.isLoadUop := io.inst === LUOP & dec.xsize =/= 0.U
-  io.isSync := (io.inst === LACC | io.inst === LUOP) & dec.xsize === 0.U
-  io.isAlu := io.inst === VMIN | io.inst === VMAX | io.inst === VADD | io.inst === VSHX
-  io.isGemm := io.inst === GEMM
-  io.isFinish := io.inst === FNSH
-}
-
-/** StoreDecode.
- *
- * Decode dependencies, type and sync for Store module.
- */
-class StoreDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_prev = Output(Bool())
-    val pop_prev = Output(Bool())
-    val isStore = Output(Bool())
-    val isSync = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_prev := dec.push_prev
-  io.pop_prev := dec.pop_prev
-  io.isStore := io.inst === SOUT & dec.xsize =/= 0.U
-  io.isSync := io.inst === SOUT & dec.xsize === 0.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala
deleted file mode 100644
index 5ef358627fec..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** EventCounters.
- *
- * This unit contains all the event counting logic. One common event tracked in
- * hardware is the number of clock cycles taken to achieve certain task. We
- * can count the total number of clock cycles spent in a VTA run by checking
- * launch and finish signals.
- *
- * The event counter value is passed to the VCR module via the ecnt port, so
- * they can be accessed by the host. The number of event counters (nECnt) is
- * defined in the Shell VCR module as a parameter, see VCRParams.
- *
- * If one would like to add an event counter, then the value of nECnt must be
- * changed in VCRParams together with the corresponding counting logic here.
- */
-class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val vp = p(ShellKey).vcrParams
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
-    val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
-    val acc_wr_event = Input(Bool())
-  })
-  val cycle_cnt = RegInit(0.U(vp.regBits.W))
-  when(io.launch && !io.finish) {
-    cycle_cnt := cycle_cnt + 1.U
-  }.otherwise {
-    cycle_cnt := 0.U
-  }
-  io.ecnt(0).valid := io.finish
-  io.ecnt(0).bits := cycle_cnt
-
-  val acc_wr_count = Reg(UInt(vp.regBits.W))
-  when (!io.launch || io.finish) {
-    acc_wr_count := 0.U
-  }.elsewhen (io.acc_wr_event) {
-    acc_wr_count := acc_wr_count + 1.U
-  }
-  io.ucnt(0).valid := io.finish
-  io.ucnt(0).bits := acc_wr_count
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala
deleted file mode 100644
index 0ea35a3e653a..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Fetch.
- *
- * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
- * VTA Memory Engine (VME), and push them into an instruction queue called
- * inst_q. Once the instruction queue is full, instructions are dispatched to
- * the Load, Compute and Store module queues based on the instruction opcode.
- * After draining the queue, the fetch unit checks if there are more instructions
- * via the ins_count register which is written by the host.
- *
- * Additionally, instructions are read into two chunks (see sReadLSB and sReadMSB)
- * because we are using a DRAM payload of 8-bytes or half of a VTA instruction.
- * This should be configurable for larger payloads, i.e. 64-bytes, which can load
- * more than one instruction at the time. Finally, the instruction queue is
- * sized (entries_q), depending on the maximum burst allowed in the memory.
- */
-class Fetch(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val ins_baddr = Input(UInt(mp.addrBits.W))
-    val ins_count = Input(UInt(vp.regBits.W))
-    val vme_rd = new VMEReadMaster
-    val inst = new Bundle {
-      val ld = Decoupled(UInt(INST_BITS.W))
-      val co = Decoupled(UInt(INST_BITS.W))
-      val st = Decoupled(UInt(INST_BITS.W))
-    }
-  })
-  val entries_q = 1 << (mp.lenBits - 1) // one-instr-every-two-vme-word
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), entries_q))
-  val dec = Module(new FetchDecode)
-
-  val s1_launch = RegNext(io.launch)
-  val pulse = io.launch & ~s1_launch
-
-  val raddr = Reg(chiselTypeOf(io.vme_rd.cmd.bits.addr))
-  val rlen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val ilen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-
-  val xrem = Reg(chiselTypeOf(io.ins_count))
-  val xsize = (io.ins_count << 1.U) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-
-  val sIdle :: sReadCmd :: sReadLSB :: sReadMSB :: sDrain :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(pulse) {
-        state := sReadCmd
-        when(xsize < xmax) {
-          rlen := xsize
-          ilen := xsize >> 1.U
-          xrem := 0.U
-        }.otherwise {
-          rlen := xmax - 1.U
-          ilen := (xmax >> 1.U) - 1.U
-          xrem := xsize - xmax
-        }
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadLSB
-      }
-    }
-    is(sReadLSB) {
-      when(io.vme_rd.data.valid) {
-        state := sReadMSB
-      }
-    }
-    is(sReadMSB) {
-      when(io.vme_rd.data.valid) {
-        when(inst_q.io.count === ilen) {
-          state := sDrain
-        }.otherwise {
-          state := sReadLSB
-        }
-      }
-    }
-    is(sDrain) {
-      when(inst_q.io.count === 0.U) {
-        when(xrem === 0.U) {
-          state := sIdle
-        }.elsewhen(xrem < xmax) {
-          state := sReadCmd
-          rlen := xrem
-          ilen := xrem >> 1.U
-          xrem := 0.U
-        }.otherwise {
-          state := sReadCmd
-          rlen := xmax - 1.U
-          ilen := (xmax >> 1.U) - 1.U
-          xrem := xrem - xmax
-        }
-      }
-    }
-  }
-
-  // read instructions from dram
-  when(state === sIdle) {
-    raddr := io.ins_baddr
-  }.elsewhen(state === sDrain && inst_q.io.count === 0.U && xrem =/= 0.U) {
-    raddr := raddr + xmax_bytes
-  }
-
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := raddr
-  io.vme_rd.cmd.bits.len := rlen
-
-  io.vme_rd.data.ready := inst_q.io.enq.ready
-
-  val lsb = Reg(chiselTypeOf(io.vme_rd.data.bits))
-  val msb = io.vme_rd.data.bits
-  val inst = Cat(msb, lsb)
-
-  when(state === sReadLSB) { lsb := io.vme_rd.data.bits }
-
-  inst_q.io.enq.valid := io.vme_rd.data.valid & state === sReadMSB
-  inst_q.io.enq.bits := inst
-
-  // decode
-  dec.io.inst := inst_q.io.deq.bits
-
-  // instruction queues
-  io.inst.ld.valid := dec.io.isLoad & inst_q.io.deq.valid & state === sDrain
-  io.inst.co.valid := dec.io.isCompute & inst_q.io.deq.valid & state === sDrain
-  io.inst.st.valid := dec.io.isStore & inst_q.io.deq.valid & state === sDrain
-
-  io.inst.ld.bits := inst_q.io.deq.bits
-  io.inst.co.bits := inst_q.io.deq.bits
-  io.inst.st.bits := inst_q.io.deq.bits
-
-  // check if selected queue is ready
-  val deq_sel = Cat(dec.io.isCompute, dec.io.isStore, dec.io.isLoad).asUInt
-  val deq_ready =
-    MuxLookup(deq_sel,
-      false.B, // default
-      Array(
-        "h_01".U -> io.inst.ld.ready,
-        "h_02".U -> io.inst.st.ready,
-        "h_04".U -> io.inst.co.ready
-      ))
-
-  // dequeue instruction
-  inst_q.io.deq.ready := deq_ready & inst_q.io.deq.valid & state === sDrain
-
-  // debug
-  if (debug) {
-    when(state === sIdle && pulse) {
-      printf("[Fetch] Launch\n")
-    }
-    // instruction
-    when(inst_q.io.deq.fire()) {
-      when(dec.io.isLoad) {
-        printf("[Fetch] [instruction decode] [L] %x\n", inst_q.io.deq.bits)
-      }
-      when(dec.io.isCompute) {
-        printf("[Fetch] [instruction decode] [C] %x\n", inst_q.io.deq.bits)
-      }
-      when(dec.io.isStore) {
-        printf("[Fetch] [instruction decode] [S] %x\n", inst_q.io.deq.bits)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala
deleted file mode 100644
index bfe89ebb41f5..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import scala.collection.mutable.HashMap
-
-/** ISAConstants.
- *
- * These constants are used for decoding (parsing) fields on instructions.
- */
-trait ISAConstants {
-  val INST_BITS = 128
-
-  val OP_BITS = 3
-
-  val M_DEP_BITS = 4
-  val M_ID_BITS = 2
-  val M_SRAM_OFFSET_BITS = 16
-  val M_DRAM_OFFSET_BITS = 32
-  val M_SIZE_BITS = 16
-  val M_STRIDE_BITS = 16
-  val M_PAD_BITS = 4
-
-  val C_UOP_BGN_BITS = 13
-  val C_UOP_END_BITS = 14
-  val C_ITER_BITS = 14
-  val C_AIDX_BITS = 11
-  val C_IIDX_BITS = 11
-  val C_WIDX_BITS = 10
-  val C_ALU_DEC_BITS = 2 // FIXME: there should be a SHL and SHR instruction
-  val C_ALU_OP_BITS = 3
-  val C_ALU_IMM_BITS = 16
-
-  val Y = true.B
-  val N = false.B
-
-  val OP_L = 0.asUInt(OP_BITS.W)
-  val OP_S = 1.asUInt(OP_BITS.W)
-  val OP_G = 2.asUInt(OP_BITS.W)
-  val OP_F = 3.asUInt(OP_BITS.W)
-  val OP_A = 4.asUInt(OP_BITS.W)
-  val OP_X = 5.asUInt(OP_BITS.W)
-
-  val ALU_OP_NUM = 5
-  val ALU_OP = Enum(ALU_OP_NUM)
-
-  val M_ID_U = 0.asUInt(M_ID_BITS.W)
-  val M_ID_W = 1.asUInt(M_ID_BITS.W)
-  val M_ID_I = 2.asUInt(M_ID_BITS.W)
-  val M_ID_A = 3.asUInt(M_ID_BITS.W)
-}
-
-/** ISA.
- *
- * This is the VTA task ISA
- *
- * TODO: Add VXOR to clear accumulator
- * TODO: Use ISA object for decoding as well
- * TODO: Eventually deprecate ISAConstants
- */
-object ISA {
-  private val xLen = 128
-  private val depBits = 4
-
-  private val idBits: HashMap[String, Int] =
-    HashMap(("task", 3), ("mem", 2), ("alu", 2))
-
-  private val taskId: HashMap[String, String] =
-    HashMap(("load", "000"),
-      ("store", "001"),
-      ("gemm", "010"),
-      ("finish", "011"),
-      ("alu", "100"))
-
-  private val memId: HashMap[String, String] =
-    HashMap(("uop", "00"), ("wgt", "01"), ("inp", "10"), ("acc", "11"))
-
-  private val aluId: HashMap[String, String] =
-    HashMap(("minpool", "00"),
-      ("maxpool", "01"),
-      ("add", "10"),
-      ("shift", "11"))
-
-  private def dontCare(bits: Int): String = "?" * bits
-
-  private def instPat(bin: String): BitPat = BitPat("b" + bin)
-
-  private def load(id: String): BitPat = {
-    val rem = xLen - idBits("mem") - depBits - idBits("task")
-    val inst = dontCare(rem) + memId(id) + dontCare(depBits) + taskId("load")
-    instPat(inst)
-  }
-
-  private def store: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("store")
-    instPat(inst)
-  }
-
-  private def gemm: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("gemm")
-    instPat(inst)
-  }
-
-  private def alu(id: String): BitPat = {
-    // TODO: move alu id next to task id
-    val inst = dontCare(18) + aluId(id) + dontCare(105) + taskId("alu")
-    instPat(inst)
-  }
-
-  private def finish: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("finish")
-    instPat(inst)
-  }
-
-  def LUOP = load("uop")
-  def LWGT = load("wgt")
-  def LINP = load("inp")
-  def LACC = load("acc")
-  def SOUT = store
-  def GEMM = gemm
-  def VMIN = alu("minpool")
-  def VMAX = alu("maxpool")
-  def VADD = alu("add")
-  def VSHX = alu("shift")
-  def FNSH = finish
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala
deleted file mode 100644
index 50c26bb8e8ed..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Load.
- *
- * Load inputs and weights from memory (DRAM) into scratchpads (SRAMs).
- * This module instantiate the TensorLoad unit which is in charge of
- * loading 1D and 2D tensors to scratchpads, so it can be used by
- * other modules such as Compute.
- */
-class Load(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Input(Bool())
-    val o_post = Output(Bool())
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val inp_baddr = Input(UInt(mp.addrBits.W))
-    val wgt_baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = Vec(2, new VMEReadMaster)
-    val inp = new TensorClient(tensorType = "inp")
-    val wgt = new TensorClient(tensorType = "wgt")
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Module(new Semaphore(counterBits = 8, counterInitValue = 0))
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  val dec = Module(new LoadDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val tensorType = Seq("inp", "wgt")
-  val tensorDec = Seq(dec.io.isInput, dec.io.isWeight)
-  val tensorLoad =
-    Seq.tabulate(2)(i => Module(new TensorLoad(tensorType = tensorType(i))))
-
-  val start = inst_q.io.deq.valid & Mux(dec.io.pop_next, s.io.sready, true.B)
-  val done = Mux(dec.io.isInput, tensorLoad(0).io.done, tensorLoad(1).io.done)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(dec.io.isInput || dec.io.isWeight) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // load tensor
-  // [0] input (inp)
-  // [1] weight (wgt)
-  val ptr = Seq(io.inp_baddr, io.wgt_baddr)
-  val tsor = Seq(io.inp, io.wgt)
-  for (i <- 0 until 2) {
-    tensorLoad(i).io.start := state === sIdle & start & tensorDec(i)
-    tensorLoad(i).io.inst := inst_q.io.deq.bits
-    tensorLoad(i).io.baddr := ptr(i)
-    tensorLoad(i).io.tensor <> tsor(i)
-    io.vme_rd(i) <> tensorLoad(i).io.vme_rd
-  }
-
-  // semaphore
-  s.io.spost := io.i_post
-  s.io.swait := dec.io.pop_next & (state === sIdle & start)
-  io.o_post := dec.io.push_next & ((state === sExe & done) | (state === sSync))
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Load] start sync\n")
-      }.elsewhen(dec.io.isInput) {
-        printf("[Load] start input\n")
-      }.elsewhen(dec.io.isWeight) {
-        printf("[Load] start weight\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Load] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        when(dec.io.isInput) {
-          printf("[Load] done input\n")
-        }.elsewhen(dec.io.isWeight) {
-          printf("[Load] done weight\n")
-        }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala
deleted file mode 100644
index 87bd50858f2e..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** UopMaster.
- *
- * Uop interface used by a master module, i.e. TensorAlu or TensorGemm,
- * to request a micro-op (uop) from the uop-scratchpad. The index (idx) is
- * used as an address to find the uop in the uop-scratchpad.
- */
-class UopMaster(implicit p: Parameters) extends Bundle {
-  val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
-  val idx = ValidIO(UInt(addrBits.W))
-  val data = Flipped(ValidIO(new UopDecode))
-  override def cloneType = new UopMaster().asInstanceOf[this.type]
-}
-
-/** UopClient.
- *
- * Uop interface used by a client module, i.e. LoadUop, to receive
- * a request from a master module, i.e. TensorAlu or TensorGemm.
- * The index (idx) is used as an address to find the uop in the uop-scratchpad.
- */
-class UopClient(implicit p: Parameters) extends Bundle {
-  val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
-  val idx = Flipped(ValidIO(UInt(addrBits.W)))
-  val data = ValidIO(new UopDecode)
-  override def cloneType = new UopClient().asInstanceOf[this.type]
-}
-
-/** LoadUop.
- *
- * Load micro-ops (uops) from memory, i.e. DRAM, and store them in the
- * uop-scratchpad. Currently, micro-ops are 32-bit wide and loaded in
- * group of 2 given the fact that the DRAM payload is 8-bytes. This module
- * should be modified later on to support different DRAM sizes efficiently.
- */
-class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = new VMEReadMaster
-    val uop = new UopClient
-  })
-  val numUop = 2 // store two uops per sram word
-  val uopBits = p(CoreKey).uopBits
-  val uopBytes = uopBits / 8
-  val uopDepth = p(CoreKey).uopMemDepth / numUop
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val raddr = Reg(chiselTypeOf(io.vme_rd.cmd.bits.addr))
-  val xcnt = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val xlen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize >> log2Ceil(numUop)) + dec.xsize(0) + (dec.sram_offset % 2.U) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-
-  val dram_even = (dec.dram_offset % 2.U) === 0.U
-  val sram_even = (dec.sram_offset % 2.U) === 0.U
-  val sizeIsEven = (dec.xsize % 2.U) === 0.U
-
-  val sIdle :: sReadCmd :: sReadData :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadCmd
-        when(xsize < xmax) {
-          xlen := xsize
-          xrem := 0.U
-        }.otherwise {
-          xlen := xmax - 1.U
-          xrem := xsize - xmax
-        }
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.vme_rd.data.valid) {
-        when(xcnt === xlen) {
-          when(xrem === 0.U) {
-            state := sIdle
-          }.otherwise {
-            raddr := raddr + xmax_bytes
-            when(xrem < xmax) {
-              state := sReadCmd
-              xlen := xrem
-              xrem := 0.U
-            }
-            .otherwise {
-              state := sReadCmd
-              xlen := xmax - 1.U
-              xrem := xrem - xmax
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // read-from-dram
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  when(state === sIdle) {
-    when(dram_even) {
-      raddr := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))
-    }.otherwise {
-      raddr := (io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))) - uopBytes.U
-    }
-  }
-
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := raddr
-  io.vme_rd.cmd.bits.len := xlen
-
-  io.vme_rd.data.ready := state === sReadData
-
-  when(state =/= sReadData) {
-    xcnt := 0.U
-  }.elsewhen(io.vme_rd.data.fire()) {
-    xcnt := xcnt + 1.U
-  }
-
-  val waddr = Reg(UInt(log2Ceil(uopDepth).W))
-  when(state === sIdle) {
-    waddr := dec.sram_offset >> log2Ceil(numUop)
-  }.elsewhen(io.vme_rd.data.fire()) {
-    waddr := waddr + 1.U
-  }
-
-  val wdata = Wire(Vec(numUop, UInt(uopBits.W)))
-  val mem = SyncReadMem(uopDepth, chiselTypeOf(wdata))
-  val wmask = Reg(Vec(numUop, Bool()))
-
-  when(sram_even) {
-    when(sizeIsEven) {
-      wmask := "b_11".U.asTypeOf(wmask)
-    }.elsewhen(io.vme_rd.cmd.fire()) {
-      when(dec.xsize === 1.U) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }.elsewhen(io.vme_rd.data.fire()) {
-      when((xcnt === xlen - 1.U) && (xrem === 0.U)) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }
-  }.otherwise {
-    when(io.vme_rd.cmd.fire()) {
-      wmask := "b_10".U.asTypeOf(wmask)
-    }.elsewhen(io.vme_rd.data.fire()) {
-      when(sizeIsEven && (xcnt === xlen - 1.U) && (xrem === 0.U)) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }
-  }
-
-  wdata := io.vme_rd.data.bits.asTypeOf(wdata)
-  when(dram_even === false.B && sram_even) {
-    wdata(0) := io.vme_rd.data.bits.asTypeOf(wdata)(1)
-  }.elsewhen(sram_even === false.B && dram_even) {
-    wdata(1) := io.vme_rd.data.bits.asTypeOf(wdata)(0)
-  }
-
-  when(io.vme_rd.data.fire()) {
-    mem.write(waddr, wdata, wmask)
-  }
-
-  // read-from-sram
-  io.uop.data.valid := RegNext(io.uop.idx.valid)
-
-  val sIdx = io.uop.idx.bits % numUop.U
-  val rIdx = io.uop.idx.bits >> log2Ceil(numUop)
-  val memRead = mem.read(rIdx, io.uop.idx.valid)
-  val sWord = memRead.asUInt.asTypeOf(wdata)
-  val sUop = sWord(sIdx).asTypeOf(io.uop.data.bits)
-
-  io.uop.data.bits <> sUop
-
-  // done
-  io.done := state === sReadData & io.vme_rd.data.valid & xcnt === xlen & xrem === 0.U
-
-  // debug
-  if (debug) {
-    when(io.vme_rd.cmd.fire()) {
-      printf("[LoadUop] cmd addr:%x len:%x rem:%x\n", raddr, xlen, xrem)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala
deleted file mode 100644
index efc895bc673e..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-
-/** Semaphore.
- *
- * This semaphore is used instead of push/pop fifo, used in the initial
- * version of VTA. This semaphore is incremented (spost) or decremented (swait)
- * depending on the push and pop fields on instructions to prevent RAW and WAR
- * hazards.
- */
-class Semaphore(counterBits: Int = 1, counterInitValue: Int = 1) extends Module {
-  val io = IO(new Bundle {
-    val spost = Input(Bool())
-    val swait = Input(Bool())
-    val sready = Output(Bool())
-  })
-  val cnt = RegInit(counterInitValue.U(counterBits.W))
-  when(io.spost && !io.swait && cnt =/= ((1 << counterBits) - 1).asUInt) {
-    cnt := cnt + 1.U
-  }
-  when(!io.spost && io.swait && cnt =/= 0.U) { cnt := cnt - 1.U }
-  io.sready := cnt =/= 0.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala
deleted file mode 100644
index 025a0a24696b..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Store.
- *
- * Store results back to memory (DRAM) from scratchpads (SRAMs).
- * This module instantiate the TensorStore unit which is in charge
- * of storing 1D and 2D tensors to main memory.
- */
-class Store(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Input(Bool())
-    val o_post = Output(Bool())
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val out_baddr = Input(UInt(mp.addrBits.W))
-    val vme_wr = new VMEWriteMaster
-    val out = new TensorClient(tensorType = "out")
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Module(new Semaphore(counterBits = 8, counterInitValue = 0))
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  val dec = Module(new StoreDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val tensorStore = Module(new TensorStore(tensorType = "out"))
-
-  val start = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s.io.sready, true.B)
-  val done = tensorStore.io.done
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(dec.io.isStore) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // store
-  tensorStore.io.start := state === sIdle & start & dec.io.isStore
-  tensorStore.io.inst := inst_q.io.deq.bits
-  tensorStore.io.baddr := io.out_baddr
-  io.vme_wr <> tensorStore.io.vme_wr
-  tensorStore.io.tensor <> io.out
-
-  // semaphore
-  s.io.spost := io.i_post
-  s.io.swait := dec.io.pop_prev & (state === sIdle & start)
-  io.o_post := dec.io.push_prev & ((state === sExe & done) | (state === sSync))
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Store] start sync\n")
-      }.elsewhen(dec.io.isStore) {
-        printf("[Store] start\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Store] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        printf("[Store] done\n")
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala
deleted file mode 100644
index 6af3c834e451..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-
-/** ALU datapath */
-class Alu(implicit p: Parameters) extends Module {
-  val aluBits = p(CoreKey).accBits
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val a = Input(SInt(aluBits.W))
-    val b = Input(SInt(aluBits.W))
-    val y = Output(SInt(aluBits.W))
-  })
-
-  // FIXME: the following three will change once we support properly SHR and SHL
-  val ub = io.b.asUInt
-  val width = log2Ceil(aluBits)
-  val m = ~ub(width - 1, 0) + 1.U
-
-  val n = ub(width - 1, 0)
-  val fop = Seq(Mux(io.a < io.b, io.a, io.b), Mux(io.a < io.b, io.b, io.a),
-    io.a + io.b, io.a >> n, io.a << m)
-
-  val opmux = Seq.tabulate(ALU_OP_NUM)(i => ALU_OP(i) -> fop(i))
-  io.y := MuxLookup(io.opcode, io.a, opmux)
-}
-
-/** Pipelined ALU */
-class AluReg(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val a = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
-    val b = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
-    val y = ValidIO(UInt(p(CoreKey).accBits.W))
-  })
-  val alu = Module(new Alu)
-  val rA = RegEnable(io.a.bits, io.a.valid)
-  val rB = RegEnable(io.b.bits, io.b.valid)
-  val valid = RegNext(io.b.valid)
-
-  alu.io.opcode := io.opcode
-
-  // register input
-  alu.io.a := rA.asSInt
-  alu.io.b := rB.asSInt
-
-  // output
-  io.y.valid := valid
-  io.y.bits := alu.io.y.asUInt
-}
-
-/** Vector of pipeline ALUs */
-class AluVector(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val acc_a = new TensorMasterData(tensorType = "acc")
-    val acc_b = new TensorMasterData(tensorType = "acc")
-    val acc_y = new TensorClientData(tensorType = "acc")
-    val out = new TensorClientData(tensorType = "out")
-  })
-  val blockOut = p(CoreKey).blockOut
-  val f = Seq.fill(blockOut)(Module(new AluReg))
-  val valid = Wire(Vec(blockOut, Bool()))
-  for (i <- 0 until blockOut) {
-    f(i).io.opcode := io.opcode
-    f(i).io.a.valid := io.acc_a.data.valid
-    f(i).io.a.bits := io.acc_a.data.bits(0)(i)
-    f(i).io.b.valid := io.acc_b.data.valid
-    f(i).io.b.bits := io.acc_b.data.bits(0)(i)
-    valid(i) := f(i).io.y.valid
-    io.acc_y.data.bits(0)(i) := f(i).io.y.bits
-    io.out.data.bits(0)(i) := f(i).io.y.bits
-  }
-  io.acc_y.data.valid := valid.asUInt.andR
-  io.out.data.valid := valid.asUInt.andR
-}
-
-/** TensorAlu.
- *
- * This unit instantiate the ALU vector unit (AluVector) and go over the
- * micro-ops (uops) which are used to read the source operands (vectors)
- * from the acc-scratchpad and then they are written back the same
- * acc-scratchpad.
- */
-class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val aluBits = p(CoreKey).accBits
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val uop = new UopMaster
-    val acc = new TensorMaster(tensorType = "acc")
-    val out = new TensorMaster(tensorType = "out")
-  })
-  val sIdle :: sReadUop :: sComputeIdx :: sReadTensorA :: sReadTensorB :: sExe :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-  val alu = Module(new AluVector)
-  val dec = io.inst.asTypeOf(new AluDecode)
-  val uop_idx = Reg(chiselTypeOf(dec.uop_end))
-  val uop_end = dec.uop_end
-  val uop_dst = Reg(chiselTypeOf(dec.uop_end))
-  val uop_src = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_o = Reg(chiselTypeOf(dec.lp_0))
-  val dst_o = Reg(chiselTypeOf(dec.uop_end))
-  val src_o = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_i = Reg(chiselTypeOf(dec.lp_1))
-  val dst_i = Reg(chiselTypeOf(dec.uop_end))
-  val src_i = Reg(chiselTypeOf(dec.uop_end))
-  val done =
-    state === sExe &
-      alu.io.out.data.valid &
-      (cnt_o === dec.lp_0 - 1.U) &
-      (cnt_i === dec.lp_1 - 1.U) &
-      (uop_idx === uop_end - 1.U)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadUop
-      }
-    }
-    is(sReadUop) {
-      state := sComputeIdx
-    }
-    is(sComputeIdx) {
-      state := sReadTensorA
-    }
-    is(sReadTensorA) {
-      state := sReadTensorB
-    }
-    is(sReadTensorB) {
-      state := sExe
-    }
-    is(sExe) {
-      when(alu.io.out.data.valid) {
-        when(
-          (cnt_o === dec.lp_0 - 1.U) &&
-            (cnt_i === dec.lp_1 - 1.U) &&
-            (uop_idx === uop_end - 1.U)) {
-          state := sIdle
-        }.otherwise {
-          state := sReadUop
-        }
-      }
-    }
-  }
-
-  when(
-    state === sIdle ||
-      (state === sExe &&
-        alu.io.out.data.valid &&
-        uop_idx === uop_end - 1.U)) {
-    uop_idx := dec.uop_begin
-  }.elsewhen(state === sExe && alu.io.out.data.valid) {
-    uop_idx := uop_idx + 1.U
-  }
-
-  when(state === sIdle) {
-    cnt_o := 0.U
-    dst_o := 0.U
-    src_o := 0.U
-  }.elsewhen(
-    state === sExe &&
-      alu.io.out.data.valid &&
-      uop_idx === uop_end - 1.U &&
-      cnt_i === dec.lp_1 - 1.U) {
-    cnt_o := cnt_o + 1.U
-    dst_o := dst_o + dec.dst_0
-    src_o := src_o + dec.src_0
-  }
-
-  when(state === sIdle) {
-    cnt_i := 0.U
-    dst_i := 0.U
-    src_i := 0.U
-  }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-    cnt_i := 0.U
-    dst_i := dst_o
-    src_i := src_o
-  }.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
-    cnt_i := cnt_i + 1.U
-    dst_i := dst_i + dec.dst_1
-    src_i := src_i + dec.src_1
-  }
-
-  when(state === sComputeIdx && io.uop.data.valid) {
-    uop_dst := io.uop.data.bits.u0 + dst_i
-    uop_src := io.uop.data.bits.u1 + src_i
-  }
-
-  // uop
-  io.uop.idx.valid := state === sReadUop
-  io.uop.idx.bits := uop_idx
-
-  // acc_i
-  io.acc.rd.idx.valid := state === sReadTensorA | (state === sReadTensorB & ~dec.alu_use_imm)
-  io.acc.rd.idx.bits := Mux(state === sReadTensorA, uop_dst, uop_src)
-
-  // imm
-  val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
-  tensorImm.data.valid := state === sReadTensorB
-  tensorImm.data.bits.foreach { b =>
-    b.foreach { c =>
-      c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
-        Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
-    }
-  }
-
-  // alu
-  val isSHR = dec.alu_op === ALU_OP(3)
-  val neg_shift = isSHR & dec.alu_imm(C_ALU_IMM_BITS - 1)
-  val fixme_alu_op = Cat(neg_shift, Mux(neg_shift, 0.U, dec.alu_op))
-  alu.io.opcode := fixme_alu_op
-  alu.io.acc_a.data.valid := io.acc.rd.data.valid & state === sReadTensorB
-  alu.io.acc_a.data.bits <> io.acc.rd.data.bits
-  alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
-    tensorImm.data.valid,
-    io.acc.rd.data.valid & state === sExe)
-  alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
-    tensorImm.data.bits,
-    io.acc.rd.data.bits)
-
-  // acc_o
-  io.acc.wr.valid := alu.io.acc_y.data.valid
-  io.acc.wr.bits.idx := uop_dst
-  io.acc.wr.bits.data <> alu.io.acc_y.data.bits
-
-  // out
-  io.out.wr.valid := alu.io.out.data.valid
-  io.out.wr.bits.idx := uop_dst
-  io.out.wr.bits.data <> alu.io.out.data.bits
-  io.out.tieoffRead() // write-only
-
-  io.done := done
-
-  if (debug) {
-
-    when(state === sReadUop) {
-      printf("[TensorAlu] [uop] idx:%x\n", uop_idx)
-    }
-
-    when(state === sReadTensorA) {
-      printf("[TensorAlu] [uop] dst:%x src:%x\n", uop_dst, uop_src)
-    }
-
-    when(state === sIdle && io.start) {
-      printf(p"[TensorAlu] decode:$dec\n")
-    }
-
-    alu.io.acc_a.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_a.data.valid) {
-            printf("[TensorAlu] [a] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.acc_b.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_b.data.valid) {
-            printf("[TensorAlu] [b] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.acc_y.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_y.data.valid) {
-            printf("[TensorAlu] [y] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.out.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.out.data.valid) {
-            printf("[TensorAlu] [out] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala
deleted file mode 100644
index f2d295f66220..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import chisel3.experimental._
-import vta.util.config._
-import scala.math.pow
-
-/** Pipelined multiply and accumulate */
-class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module {
-  val outBits = Math.max(aBits + bBits, cBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val c = Input(SInt(cBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val mult = Wire(SInt((aBits + bBits).W))
-  val add = Wire(SInt(outBits.W))
-  val rA = RegNext(io.a)
-  val rB = RegNext(io.b)
-  val rC = RegNext(io.c)
-
-  mult := rA * rB
-  add := rC +& mult
-
-  io.y := add
-}
-
-/** PipeAdder
- *
- * This unit loads input bits into register and performs addition in the next cycle
- */
-class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module {
-  val outBits = Math.max(aBits, bBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val add = Wire(SInt(outBits.W))
-  val rA = RegNext(io.a)
-  val rB = RegNext(io.b)
-  add := rA +& rB
-  io.y := add
-}
-
-/** Adder
- *
- * This unit wires input bits to an adder directly.
- * The output comes out of combinational logic without waiting for another cycle.
- */
-class Adder(aBits: Int = 8, bBits: Int = 8) extends Module {
-  val outBits = Math.max(aBits, bBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val add = Wire(SInt(outBits.W))
-  val rA = Wire(SInt(aBits.W))
-  val rB = Wire(SInt(bBits.W))
-  rA := io.a
-  rB := io.b
-  add := rA +& rB
-  io.y := add
-}
-
-/** Pipelined DotProduct based on MAC and PipeAdder */
-class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module {
-  val errorMsg =
-    s"\n\n[VTA] [DotProduct] size must be greater than 4 and a power of 2\n\n"
-  require(size >= 2 && isPow2(size), errorMsg)
-  val b = aBits + bBits
-  val outBits = b + log2Ceil(size) + 1
-  val io = IO(new Bundle {
-    val a = Input(Vec(size, SInt(aBits.W)))
-    val b = Input(Vec(size, SInt(bBits.W)))
-    val y = Output(SInt(outBits.W))
-  })
-  val s = Seq.tabulate(log2Ceil(size + 1))(i =>
-    pow(2, log2Ceil(size) - i).toInt) // # of total layers
-  val p = log2Ceil(size / 2) + 1 // # of adder layers
-  val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs
-  val a = Seq.tabulate(p)(
-    i =>
-      Seq.fill(s(i + 1))(
-        if (i == 0)
-          Module(new PipeAdder(aBits = (b + i + 1), bBits = (b + i + 1)))
-        else
-          Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer
-
-  // Vector MACs
-  for (i <- 0 until s(0)) {
-    m(i).io.a := io.a(i)
-    m(i).io.b := io.b(i)
-    m(i).io.c := 0.S
-  }
-
-  // PipeAdder Reduction
-  for (i <- 0 until p) {
-    for (j <- 0 until s(i + 1)) {
-      if (i == 0) {
-        // First layer of PipeAdders
-        a(i)(j).io.a := m(2 * j).io.y
-        a(i)(j).io.b := m(2 * j + 1).io.y
-      } else {
-        a(i)(j).io.a := a(i - 1)(2 * j).io.y
-        a(i)(j).io.b := a(i - 1)(2 * j + 1).io.y
-      }
-    }
-  }
-
-  // last adder
-  io.y := a(p - 1)(0).io.y
-}
-
-/** Perform matrix-vector-multiplication based on DotProduct */
-class MatrixVectorMultiplication(implicit p: Parameters) extends Module {
-  val accBits = p(CoreKey).accBits
-  val size = p(CoreKey).blockOut
-  val inpBits = p(CoreKey).inpBits
-  val wgtBits = p(CoreKey).wgtBits
-  val outBits = p(CoreKey).outBits
-  val io = IO(new Bundle {
-    val reset = Input(Bool()) // FIXME: reset should be replaced by a load-acc instr
-    val inp = new TensorMasterData(tensorType = "inp")
-    val wgt = new TensorMasterData(tensorType = "wgt")
-    val acc_i = new TensorMasterData(tensorType = "acc")
-    val acc_o = new TensorClientData(tensorType = "acc")
-    val out = new TensorClientData(tensorType = "out")
-  })
-  val dot = Seq.fill(size)(
-    Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size)))
-  // Latency is defined as two in the following, because there is one cycle in the MAC module,
-  // and another cycle in the pipelined adders as the first layer of the accumulator
-  val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2)))
-  val add = Seq.fill(size)(Wire(SInt(accBits.W)))
-  val vld = Wire(Vec(size, Bool()))
-
-  for (i <- 0 until size) {
-    acc(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
-    acc(i).io.enq.bits := io.acc_i.data.bits(0)(i)
-    for (j <- 0 until size) {
-      dot(i).io.a(j) := io.inp.data.bits(0)(j).asSInt
-      dot(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
-    }
-    add(i) := acc(i).io.deq.bits.asSInt + dot(i).io.y
-    io.acc_o.data.bits(0)(i) := Mux(io.reset, 0.U, add(i).asUInt)
-    io.out.data.bits(0)(i) := add(i).asUInt
-    vld(i) := acc(i).io.deq.valid
-  }
-  io.acc_o.data.valid := vld.asUInt.andR | io.reset
-  io.out.data.valid := vld.asUInt.andR
-}
-
-/** TensorGemm.
- *
- * This unit instantiate the MatrixVectorMultiplication and go over the
- * micro-ops (uops) which are used to read inputs, weights and biases,
- * and writes results back to the acc and out scratchpads.
- *
- * Also, the TensorGemm uses the reset field in the Gemm instruction to
- * clear or zero-out the acc-scratchpad locations based on the micro-ops.
- */
-class TensorGemm(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val uop = new UopMaster
-    val inp = new TensorMaster(tensorType = "inp")
-    val wgt = new TensorMaster(tensorType = "wgt")
-    val acc = new TensorMaster(tensorType = "acc")
-    val out = new TensorMaster(tensorType = "out")
-  })
-  val sIdle :: sReadUop :: sComputeIdx :: sReadTensor :: sExe :: sWait :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-  val mvc = Module(new MatrixVectorMultiplication)
-  val dec = io.inst.asTypeOf(new GemmDecode)
-  val uop_idx = Reg(chiselTypeOf(dec.uop_end))
-  val uop_end = dec.uop_end
-  val uop_acc = Reg(chiselTypeOf(dec.uop_end))
-  val uop_inp = Reg(chiselTypeOf(dec.uop_end))
-  val uop_wgt = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_o = Reg(chiselTypeOf(dec.lp_0))
-  val acc_o = Reg(chiselTypeOf(dec.uop_end))
-  val inp_o = Reg(chiselTypeOf(dec.uop_end))
-  val wgt_o = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_i = Reg(chiselTypeOf(dec.lp_1))
-  val acc_i = Reg(chiselTypeOf(dec.uop_end))
-  val inp_i = Reg(chiselTypeOf(dec.uop_end))
-  val wgt_i = Reg(chiselTypeOf(dec.uop_end))
-  val pBits = log2Ceil(p(CoreKey).blockOut) + 1
-  val inflight = Reg(UInt(pBits.W))
-  // Latency is defined as two in the following, because there is one cycle in the MAC module,
-  // and another cycle in the pipelined adders as the first layer of the accumulator
-  val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2))
-  val done = inflight === 0.U &
-    ((state === sExe &
-      cnt_o === dec.lp_0 - 1.U &
-      cnt_i === dec.lp_1 - 1.U &
-      uop_idx === uop_end - 1.U &
-      inflight === 0.U) |
-      state === sWait)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadUop
-      }
-    }
-    is(sReadUop) {
-      state := sComputeIdx
-    }
-    is(sComputeIdx) {
-      state := sReadTensor
-    }
-    is(sReadTensor) {
-      state := sExe
-    }
-    is(sExe) {
-      when(
-        (cnt_o === dec.lp_0 - 1.U) &&
-          (cnt_i === dec.lp_1 - 1.U) &&
-          (uop_idx === uop_end - 1.U)) {
-        when(inflight =/= 0.U) {
-          state := sWait
-        }.otherwise {
-          state := sIdle
-        }
-      }.otherwise {
-        state := sReadUop
-      }
-    }
-    is(sWait) {
-      when(inflight === 0.U) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    inflight := 0.U
-  }.elsewhen(!dec.reset) {
-    when((state === sReadTensor) && mvc.io.acc_o.data.valid) { // issue & commit
-      inflight := inflight
-    }.elsewhen(state === sReadTensor) { // issue a tensor
-      inflight := inflight + 1.U
-    }.elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor
-      inflight := inflight - 1.U
-    }
-  }
-
-  when(
-    state === sIdle ||
-      (state === sExe &&
-        uop_idx === uop_end - 1.U)) {
-    uop_idx := dec.uop_begin
-  }.elsewhen(state === sExe && dec.uop_begin =/= uop_end) {
-    uop_idx := uop_idx + 1.U
-  }
-
-  when(state === sIdle) {
-    cnt_o := 0.U
-    acc_o := 0.U
-    inp_o := 0.U
-    wgt_o := 0.U
-  }.elsewhen(
-    state === sExe &&
-      uop_idx === uop_end - 1.U &&
-      cnt_i === dec.lp_1 - 1.U) {
-    cnt_o := cnt_o + 1.U
-    acc_o := acc_o + dec.acc_0
-    inp_o := inp_o + dec.inp_0
-    wgt_o := wgt_o + dec.wgt_0
-  }
-
-  when(state === sIdle) {
-    cnt_i := 0.U
-    acc_i := 0.U
-    inp_i := 0.U
-    wgt_i := 0.U
-  }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-    cnt_i := 0.U
-    acc_i := acc_o
-    inp_i := inp_o
-    wgt_i := wgt_o
-  }.elsewhen(state === sExe && uop_idx === uop_end - 1.U) {
-    cnt_i := cnt_i + 1.U
-    acc_i := acc_i + dec.acc_1
-    inp_i := inp_i + dec.inp_1
-    wgt_i := wgt_i + dec.wgt_1
-  }
-
-  when(state === sComputeIdx && io.uop.data.valid) {
-    uop_acc := io.uop.data.bits.u0 + acc_i
-    uop_inp := io.uop.data.bits.u1 + inp_i
-    uop_wgt := io.uop.data.bits.u2 + wgt_i
-  }
-
-  wrpipe.io.enq.valid := state === sExe & ~dec.reset
-  wrpipe.io.enq.bits := uop_acc
-
-  // uop
-  io.uop.idx.valid := state === sReadUop
-  io.uop.idx.bits := uop_idx
-
-  // inp
-  io.inp.rd.idx.valid := state === sReadTensor
-  io.inp.rd.idx.bits := uop_inp
-  io.inp.tieoffWrite() // read-only
-
-  // wgt
-  io.wgt.rd.idx.valid := state === sReadTensor
-  io.wgt.rd.idx.bits := uop_wgt
-  io.wgt.tieoffWrite() // read-only
-
-  // acc_i
-  io.acc.rd.idx.valid := state === sReadTensor
-  io.acc.rd.idx.bits := uop_acc
-
-  // mvc
-  mvc.io.reset := dec.reset & state === sExe
-  mvc.io.inp.data <> io.inp.rd.data
-  mvc.io.wgt.data <> io.wgt.rd.data
-  mvc.io.acc_i.data <> io.acc.rd.data
-
-  // acc_o
-  io.acc.wr.valid := mvc.io.acc_o.data.valid &
-    Mux(dec.reset, true.B, wrpipe.io.deq.valid)
-  io.acc.wr.bits.idx := Mux(dec.reset, uop_acc, wrpipe.io.deq.bits)
-  io.acc.wr.bits.data <> mvc.io.acc_o.data.bits
-
-  // out
-  io.out.wr.valid := mvc.io.out.data.valid & wrpipe.io.deq.valid
-  io.out.wr.bits.idx := wrpipe.io.deq.bits
-  io.out.wr.bits.data <> mvc.io.out.data.bits
-  io.out.tieoffRead() // write-only
-
-  io.done := done
-
-  if (debug) {
-    when(state === sReadUop && ~dec.reset) {
-      printf("[TensorGemm] [uop] idx:%x\n", uop_idx)
-    }
-
-    when(state === sReadTensor && ~dec.reset) {
-      printf("[TensorGemm] [uop] acc:%x inp:%x wgt:%x\n", uop_acc, uop_inp, uop_wgt)
-    }
-
-    io.inp.rd.data.bits.zipWithIndex.foreach {
-      case (r, i) =>
-        when(io.inp.rd.data.valid && ~dec.reset) {
-          printf("[TensorGemm] [inp] i:%x val:%x\n", i.U, r.asUInt)
-        }
-    }
-
-    io.wgt.rd.data.bits.zipWithIndex.foreach {
-      case (r, i) =>
-        when(io.wgt.rd.data.valid && ~dec.reset) {
-          printf("[TensorGemm] [wgt] i:%x val:%x\n", i.U, r.asUInt)
-        }
-    }
-
-    io.acc.rd.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(io.acc.rd.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [acc_i] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    mvc.io.acc_o.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(mvc.io.acc_o.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [acc_o] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    mvc.io.out.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(mvc.io.out.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [out] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala
deleted file mode 100644
index 5ab690d8637c..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorLoad.
- *
- * Load 1D and 2D tensors from main memory (DRAM) to input/weight
- * scratchpads (SRAM). Also, there is support for zero padding, while
- * doing the load. Zero-padding works on the y and x axis, and it is
- * managed by TensorPadCtrl. The TensorDataCtrl is in charge of
- * handling the way tensors are stored on the scratchpads.
- */
-class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
-    implicit p: Parameters)
-    extends Module {
-  val tp = new TensorParams(tensorType)
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = new VMEReadMaster
-    val tensor = new TensorClient(tensorType)
-  })
-  val sizeFactor = tp.tensorLength * tp.numMemBlock
-  val strideFactor = tp.tensorLength * tp.tensorWidth
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val dataCtrl = Module(
-    new TensorDataCtrl(tensorType, sizeFactor, strideFactor))
-  val dataCtrlDone = RegInit(false.B)
-  val yPadCtrl0 = Module(new TensorPadCtrl(padType = "YPad0", sizeFactor))
-  val yPadCtrl1 = Module(new TensorPadCtrl(padType = "YPad1", sizeFactor))
-  val xPadCtrl0 = Module(new TensorPadCtrl(padType = "XPad0", sizeFactor))
-  val xPadCtrl1 = Module(new TensorPadCtrl(padType = "XPad1", sizeFactor))
-
-  val tag = Reg(UInt(log2Ceil(tp.numMemBlock).W))
-  val set = Reg(UInt(log2Ceil(tp.tensorLength).W))
-
-  val sIdle :: sYPad0 :: sXPad0 :: sReadCmd :: sReadData :: sXPad1 :: sYPad1 :: Nil =
-    Enum(7)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        when(dec.ypad_0 =/= 0.U) {
-          state := sYPad0
-        }.elsewhen(dec.xpad_0 =/= 0.U) {
-          state := sXPad0
-        }.otherwise {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sYPad0) {
-      when(yPadCtrl0.io.done) {
-        when(dec.xpad_0 =/= 0.U) {
-          state := sXPad0
-        }.otherwise {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sXPad0) {
-      when(xPadCtrl0.io.done) {
-        state := sReadCmd
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.vme_rd.data.valid) {
-        when(dataCtrl.io.done) {
-          when(dec.xpad_1 =/= 0.U) {
-            state := sXPad1
-          }.elsewhen(dec.ypad_1 =/= 0.U) {
-            state := sYPad1
-          }.otherwise {
-            state := sIdle
-          }
-        }.elsewhen(dataCtrl.io.stride) {
-          when(dec.xpad_1 =/= 0.U) {
-            state := sXPad1
-          }.elsewhen(dec.xpad_0 =/= 0.U) {
-            state := sXPad0
-          }.otherwise {
-            state := sReadCmd
-          }
-        }.elsewhen(dataCtrl.io.split) {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sXPad1) {
-      when(xPadCtrl1.io.done) {
-        when(dataCtrlDone) {
-          when(dec.ypad_1 =/= 0.U) {
-            state := sYPad1
-          }.otherwise {
-            state := sIdle
-          }
-        }.otherwise {
-          when(dec.xpad_0 =/= 0.U) {
-            state := sXPad0
-          }.otherwise {
-            state := sReadCmd
-          }
-        }
-      }
-    }
-    is(sYPad1) {
-      when(yPadCtrl1.io.done && dataCtrlDone) {
-        state := sIdle
-      }
-    }
-  }
-
-  // data controller
-  dataCtrl.io.start := state === sIdle & io.start
-  dataCtrl.io.inst := io.inst
-  dataCtrl.io.baddr := io.baddr
-  dataCtrl.io.xinit := io.vme_rd.cmd.fire()
-  dataCtrl.io.xupdate := io.vme_rd.data.fire()
-  dataCtrl.io.yupdate := io.vme_rd.data.fire()
-
-  when(state === sIdle) {
-    dataCtrlDone := false.B
-  }.elsewhen(io.vme_rd.data.fire() && dataCtrl.io.done) {
-    dataCtrlDone := true.B
-  }
-
-  // pad
-  yPadCtrl0.io.start := dec.ypad_0 =/= 0.U & state === sIdle & io.start
-
-  yPadCtrl1.io.start := dec.ypad_1 =/= 0.U &
-    ((io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U) |
-      (state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone))
-
-  xPadCtrl0.io.start := dec.xpad_0 =/= 0.U &
-    ((state === sIdle & io.start) |
-      (state === sYPad0 & yPadCtrl0.io.done) |
-      (io.vme_rd.data.fire() & ~dataCtrlDone & dataCtrl.io.stride & dec.xpad_1 === 0.U) |
-      (state === sXPad1 & xPadCtrl1.io.done & ~dataCtrlDone))
-
-  xPadCtrl1.io.start := dec.xpad_1 =/= 0.U & io.vme_rd.data.fire() &
-    ((dataCtrl.io.done) | (~dataCtrl.io.done & dataCtrl.io.stride & dec.xpad_1 =/= 0.U))
-
-  yPadCtrl0.io.inst := io.inst
-  yPadCtrl1.io.inst := io.inst
-  xPadCtrl0.io.inst := io.inst
-  xPadCtrl1.io.inst := io.inst
-
-  // read-from-dram
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := dataCtrl.io.addr
-  io.vme_rd.cmd.bits.len := dataCtrl.io.len
-
-  io.vme_rd.data.ready := state === sReadData
-
-  // write-to-sram
-  val isZeroPad = state === sYPad0 |
-    state === sXPad0 |
-    state === sXPad1 |
-    state === sYPad1
-
-  when(state === sIdle || state === sReadCmd || tag === (tp.numMemBlock - 1).U) {
-    tag := 0.U
-  }.elsewhen(io.vme_rd.data.fire() || isZeroPad) {
-    tag := tag + 1.U
-  }
-
-  when(state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
-    set := 0.U
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
-    set := set + 1.U
-  }
-
-  val waddr_cur = Reg(UInt(tp.memAddrBits.W))
-  val waddr_nxt = Reg(UInt(tp.memAddrBits.W))
-  when(state === sIdle) {
-    waddr_cur := dec.sram_offset
-    waddr_nxt := dec.sram_offset
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad)
-    && set === (tp.tensorLength - 1).U
-    && tag === (tp.numMemBlock - 1).U)
-  {
-    waddr_cur := waddr_cur + 1.U
-  }.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire()) {
-    waddr_cur := waddr_nxt + dec.xsize
-    waddr_nxt := waddr_nxt + dec.xsize
-  }
-
-  val tensorFile = Seq.fill(tp.tensorLength) {
-    SyncReadMem(tp.memDepth, Vec(tp.numMemBlock, UInt(tp.memBlockBits.W)))
-  }
-  val wmask = Seq.fill(tp.tensorLength) { Wire(Vec(tp.numMemBlock, Bool())) }
-  val wdata = Seq.fill(tp.tensorLength) {
-    Wire(Vec(tp.numMemBlock, UInt(tp.memBlockBits.W)))
-  }
-  val no_mask = Wire(Vec(tp.numMemBlock, Bool()))
-  no_mask.foreach { m =>
-    m := true.B
-  }
-
-  for (i <- 0 until tp.tensorLength) {
-    for (j <- 0 until tp.numMemBlock) {
-      wmask(i)(j) := tag === j.U
-      wdata(i)(j) := Mux(isZeroPad, 0.U, io.vme_rd.data.bits)
-    }
-    val tdata = io.tensor.wr.bits.data(i).asUInt.asTypeOf(wdata(i))
-    val muxWen =
-      Mux(state === sIdle,
-        io.tensor.wr.valid,
-        (io.vme_rd.data.fire() | isZeroPad) & set === i.U)
-    val muxWaddr = Mux(state === sIdle, io.tensor.wr.bits.idx, waddr_cur)
-    val muxWdata = Mux(state === sIdle, tdata, wdata(i))
-    val muxWmask = Mux(state === sIdle, no_mask, wmask(i))
-    when(muxWen) {
-      tensorFile(i).write(muxWaddr, muxWdata, muxWmask)
-    }
-  }
-
-  // read-from-sram
-  val rvalid = RegNext(io.tensor.rd.idx.valid)
-  io.tensor.rd.data.valid := rvalid
-
-  val rdata =
-    tensorFile.map(_.read(io.tensor.rd.idx.bits, io.tensor.rd.idx.valid))
-  rdata.zipWithIndex.foreach {
-    case (r, i) =>
-      io.tensor.rd.data.bits(i) := r.asUInt.asTypeOf(io.tensor.rd.data.bits(i))
-  }
-
-  // done
-  val done_no_pad = io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
-  val done_x_pad = state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone & dec.ypad_1 === 0.U
-  val done_y_pad = state === sYPad1 & dataCtrlDone & yPadCtrl1.io.done
-  io.done := done_no_pad | done_x_pad | done_y_pad
-
-  // debug
-  if (debug) {
-    if (tensorType == "inp") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [inp] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-      when(state === sYPad0) {
-        printf("[TensorLoad] [inp] sYPad0\n")
-      }
-      when(state === sYPad1) {
-        printf("[TensorLoad] [inp] sYPad1\n")
-      }
-      when(state === sXPad0) {
-        printf("[TensorLoad] [inp] sXPad0\n")
-      }
-      when(state === sXPad1) {
-        printf("[TensorLoad] [inp] sXPad1\n")
-      }
-    } else if (tensorType == "wgt") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [wgt] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-    } else if (tensorType == "acc") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [acc] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala
deleted file mode 100644
index 9b4bf748a3a5..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorStore.
- *
- * Store 1D and 2D tensors from out-scratchpad (SRAM) to main memory (DRAM).
- */
-class TensorStore(tensorType: String = "none", debug: Boolean = false)(
-    implicit p: Parameters)
-    extends Module {
-  val tp = new TensorParams(tensorType)
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_wr = new VMEWriteMaster
-    val tensor = new TensorClient(tensorType)
-  })
-  val tensorLength = tp.tensorLength
-  val tensorWidth = tp.tensorWidth
-  val tensorElemBits = tp.tensorElemBits
-  val memBlockBits = tp.memBlockBits
-  val memDepth = tp.memDepth
-  val numMemBlock = tp.numMemBlock
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val waddr_cur = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val waddr_nxt = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val xcnt = Reg(chiselTypeOf(io.vme_wr.cmd.bits.len))
-  val xlen = Reg(chiselTypeOf(io.vme_wr.cmd.bits.len))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize << log2Ceil(tensorLength * numMemBlock)) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-  val ycnt = Reg(chiselTypeOf(dec.ysize))
-  val ysize = dec.ysize
-  val tag = Reg(UInt(8.W))
-  val set = Reg(UInt(8.W))
-
-  val xfer_bytes = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val xstride_bytes = dec.xstride << log2Ceil(tensorLength * tensorWidth)
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8
-  val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3)
-
-  val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes)))
-  val xfer_split_addr = waddr_cur + xfer_bytes
-  val xfer_stride_addr = waddr_nxt + xstride_bytes
-
-  val xfer_init_bytes   = xmax_bytes - xfer_init_addr % xmax_bytes
-  val xfer_init_pulses  = xfer_init_bytes >> pulse_bytes_bits
-  val xfer_split_bytes  = xmax_bytes - xfer_split_addr % xmax_bytes
-  val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits
-  val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes
-  val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits
-
-  val sIdle :: sWriteCmd :: sWriteData :: sReadMem :: sWriteAck :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      xfer_bytes := xfer_init_bytes
-      when (io.start) {
-        state := sWriteCmd
-        when (xsize < xfer_init_pulses) {
-          xlen := xsize
-          xrem := 0.U
-        }.otherwise {
-          xlen := xfer_init_pulses - 1.U
-          xrem := xsize - xfer_init_pulses
-        }
-      }
-    }
-    is(sWriteCmd) {
-      when(io.vme_wr.cmd.ready) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.vme_wr.data.ready) {
-        when(xcnt === xlen) {
-          state := sWriteAck
-        }.elsewhen(tag === (numMemBlock - 1).U) {
-          state := sReadMem
-        }
-      }
-    }
-    is(sReadMem) {
-      state := sWriteData
-    }
-    is(sWriteAck) {
-      when(io.vme_wr.ack) {
-        when(xrem === 0.U) {
-          when(ycnt === ysize - 1.U) {
-            state := sIdle
-          }.otherwise { // stride
-            state := sWriteCmd
-            xfer_bytes := xfer_stride_bytes
-            when(xsize < xfer_stride_pulses) {
-              xlen := xsize
-              xrem := 0.U
-            }.otherwise {
-              xlen := xfer_stride_pulses - 1.U
-              xrem := xsize - xfer_stride_pulses
-            }
-          }
-        } // split
-        .elsewhen(xrem < xfer_split_pulses) {
-          state := sWriteCmd
-          xfer_bytes := xfer_split_bytes
-          xlen := xrem
-          xrem := 0.U
-        }
-        .otherwise {
-          state := sWriteCmd
-          xfer_bytes := xfer_split_bytes
-          xlen := xfer_split_pulses - 1.U
-          xrem := xrem - xfer_split_pulses
-        }
-      }
-    }
-  }
-
-  // write-to-sram
-  val tensorFile = Seq.fill(tensorLength) {
-    SyncReadMem(memDepth, Vec(numMemBlock, UInt(memBlockBits.W)))
-  }
-  val wdata_t = Wire(Vec(numMemBlock, UInt(memBlockBits.W)))
-  val no_mask = Wire(Vec(numMemBlock, Bool()))
-
-  wdata_t := DontCare
-  no_mask.foreach { m =>
-    m := true.B
-  }
-
-  for (i <- 0 until tensorLength) {
-    val inWrData = io.tensor.wr.bits.data(i).asUInt.asTypeOf(wdata_t)
-    when(io.tensor.wr.valid) {
-      tensorFile(i).write(io.tensor.wr.bits.idx, inWrData, no_mask)
-    }
-  }
-
-  // read-from-sram
-  val stride = state === sWriteAck &
-    io.vme_wr.ack &
-    xcnt === xlen + 1.U &
-    xrem === 0.U &
-    ycnt =/= ysize - 1.U
-
-  when(state === sIdle) {
-    ycnt := 0.U
-  }.elsewhen(stride) {
-    ycnt := ycnt + 1.U
-  }
-
-  when(state === sWriteCmd || tag === (numMemBlock - 1).U) {
-    tag := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
-    tag := tag + 1.U
-  }
-
-  when(
-    state === sWriteCmd || (set === (tensorLength - 1).U && tag === (numMemBlock - 1).U)) {
-    set := 0.U
-  }.elsewhen(io.vme_wr.data.fire() && tag === (numMemBlock - 1).U) {
-    set := set + 1.U
-  }
-
-  val raddr_cur = Reg(UInt(tp.memAddrBits.W))
-  val raddr_nxt = Reg(UInt(tp.memAddrBits.W))
-  when(state === sIdle) {
-    raddr_cur := dec.sram_offset
-    raddr_nxt := dec.sram_offset
-  }.elsewhen(io.vme_wr.data.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
-    raddr_cur := raddr_cur + 1.U
-  }.elsewhen(stride) {
-    raddr_cur := raddr_nxt + dec.xsize
-    raddr_nxt := raddr_nxt + dec.xsize
-  }
-
-  val tread = Seq.tabulate(tensorLength) { i =>
-    i.U ->
-      tensorFile(i).read(raddr_cur, state === sWriteCmd | state === sReadMem)
-  }
-  val mdata = MuxLookup(set, 0.U.asTypeOf(chiselTypeOf(wdata_t)), tread)
-
-  // write-to-dram
-  when(state === sIdle) {
-    waddr_cur := xfer_init_addr
-    waddr_nxt := xfer_init_addr
-  }.elsewhen(state === sWriteAck && io.vme_wr.ack && xrem =/= 0.U) {
-    waddr_cur := xfer_split_addr
-  }.elsewhen(stride) {
-    waddr_cur := xfer_stride_addr
-    waddr_nxt := xfer_stride_addr
-  }
-
-  io.vme_wr.cmd.valid := state === sWriteCmd
-  io.vme_wr.cmd.bits.addr := waddr_cur
-  io.vme_wr.cmd.bits.len := xlen
-
-  io.vme_wr.data.valid := state === sWriteData
-  io.vme_wr.data.bits := mdata(tag)
-
-  when(state === sWriteCmd) {
-    xcnt := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
-    xcnt := xcnt + 1.U
-  }
-
-  // disable external read-from-sram requests
-  io.tensor.tieoffRead()
-
-  // done
-  io.done := state === sWriteAck & io.vme_wr.ack & xrem === 0.U & ycnt === ysize - 1.U
-
-  // debug
-  if (debug) {
-    when(io.vme_wr.cmd.fire()) {
-      printf("[TensorStore] ysize:%x ycnt:%x raddr:%x waddr:%x len:%x rem:%x\n",
-        ysize, ycnt, raddr_cur, waddr_cur, xlen, xrem)
-    }
-    when(io.vme_wr.data.fire()) {
-      printf("[TensorStore] data:%x\n", io.vme_wr.data.bits)
-    }
-    when(io.vme_wr.ack) {
-      printf("[TensorStore] ack\n")
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala
deleted file mode 100644
index d0a8ba7ef647..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorParams.
- *
- * This Bundle derives parameters for each tensorType, including inputs (inp),
- * weights (wgt), biases (acc), and outputs (out). This is used to avoid
- * doing the same boring calculations over and over again.
- */
-class TensorParams(tensorType: String = "none")(implicit p: Parameters) extends Bundle {
-  val errorMsg =
-    s"\n\n[VTA] [TensorParams] only inp, wgt, acc, and out supported\n\n"
-
-  require(tensorType == "inp" || tensorType == "wgt"
-    || tensorType == "acc" || tensorType == "out",
-    errorMsg)
-
-  val (tensorLength, tensorWidth, tensorElemBits) =
-    if (tensorType == "inp")
-      (p(CoreKey).batch, p(CoreKey).blockIn, p(CoreKey).inpBits)
-    else if (tensorType == "wgt")
-      (p(CoreKey).blockOut, p(CoreKey).blockIn, p(CoreKey).wgtBits)
-    else if (tensorType == "acc")
-      (p(CoreKey).batch, p(CoreKey).blockOut, p(CoreKey).accBits)
-    else
-      (p(CoreKey).batch, p(CoreKey).blockOut, p(CoreKey).outBits)
-
-  val memBlockBits = p(ShellKey).memParams.dataBits
-  val numMemBlock = (tensorWidth * tensorElemBits) / memBlockBits
-
-  val memDepth =
-    if (tensorType == "inp")
-      p(CoreKey).inpMemDepth
-    else if (tensorType == "wgt")
-      p(CoreKey).wgtMemDepth
-    else if (tensorType == "acc")
-      p(CoreKey).accMemDepth
-    else
-      p(CoreKey).outMemDepth
-
-  val memAddrBits = log2Ceil(memDepth)
-}
-
-/** TensorMaster.
- *
- * This interface issue read and write tensor-requests to scratchpads. For example,
- * The TensorGemm unit uses this interface for managing the inputs (inp), weights (wgt),
- * biases (acc), and outputs (out).
- *
- */
-class TensorMaster(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val rd = new Bundle {
-    val idx = ValidIO(UInt(memAddrBits.W))
-    val data = Flipped(
-      ValidIO(Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))))
-  }
-  val wr = ValidIO(new Bundle {
-    val idx = UInt(memAddrBits.W)
-    val data = Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))
-  })
-  def tieoffRead() {
-    rd.idx.valid := false.B
-    rd.idx.bits := 0.U
-  }
-  def tieoffWrite() {
-    wr.valid := false.B
-    wr.bits.idx := 0.U
-    wr.bits.data.foreach { b =>
-      b.foreach { c =>
-        c := 0.U
-      }
-    }
-  }
-  override def cloneType =
-    new TensorMaster(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorClient.
- *
- * This interface receives read and write tensor-requests to scratchpads. For example,
- * The TensorLoad unit uses this interface for receiving read and write requests from
- * the TensorGemm unit.
- */
-class TensorClient(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val rd = new Bundle {
-    val idx = Flipped(ValidIO(UInt(memAddrBits.W)))
-    val data = ValidIO(
-      Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W))))
-  }
-  val wr = Flipped(ValidIO(new Bundle {
-    val idx = UInt(memAddrBits.W)
-    val data = Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))
-  }))
-  def tieoffRead() {
-    rd.data.valid := false.B
-    rd.data.bits.foreach { b =>
-      b.foreach { c =>
-        c := 0.U
-      }
-    }
-  }
-  override def cloneType =
-    new TensorClient(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorMasterData.
- *
- * This interface is only used for datapath only purposes and the direction convention
- * is based on the TensorMaster interface, which means this is an input. This interface
- * is used on datapath only module such MatrixVectorCore or AluVector.
- */
-class TensorMasterData(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val data = Flipped(
-    ValidIO(Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))))
-  override def cloneType =
-    new TensorMasterData(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorClientData.
- *
- * This interface is only used for datapath only purposes and the direction convention
- * is based on the TensorClient interface, which means this is an output. This interface
- * is used on datapath only module such MatrixVectorCore or AluVector.
- */
-class TensorClientData(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val data = ValidIO(
-    Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W))))
-  override def cloneType =
-    new TensorClientData(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorPadCtrl. Zero-padding controller for TensorLoad. */
-class TensorPadCtrl(padType: String = "none", sizeFactor: Int = 1) extends Module {
-  val errorMsg =
-    s"\n\n\n[VTA-ERROR] only YPad0, YPad1, XPad0, or XPad1 supported\n\n\n"
-  require(padType == "YPad0" || padType == "YPad1"
-    || padType == "XPad0" || padType == "XPad1",
-    errorMsg)
-
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-  })
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-
-  val xmax = Reg(chiselTypeOf(dec.xsize))
-  val ymax = Reg(chiselTypeOf(dec.ypad_0))
-  val xcnt = Reg(chiselTypeOf(dec.xsize))
-  val ycnt = Reg(chiselTypeOf(dec.ypad_0))
-
-  val xval =
-    if (padType == "YPad0" || padType == "YPad1")
-      ((dec.xpad_0 + dec.xsize + dec.xpad_1) << log2Ceil(sizeFactor)) - 1.U
-    else if (padType == "XPad0")
-      (dec.xpad_0 << log2Ceil(sizeFactor)) - 1.U
-    else
-      (dec.xpad_1 << log2Ceil(sizeFactor)) - 1.U
-
-  val yval =
-    if (padType == "YPad0")
-      Mux(dec.ypad_0 =/= 0.U, dec.ypad_0 - 1.U, 0.U)
-    else if (padType == "YPad1")
-      Mux(dec.ypad_1 =/= 0.U, dec.ypad_1 - 1.U, 0.U)
-    else
-      0.U
-
-  val sIdle :: sActive :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sActive
-      }
-    }
-    is(sActive) {
-      when(ycnt === ymax && xcnt === xmax) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    xmax := xval
-    ymax := yval
-  }
-
-  when(state === sIdle || xcnt === xmax) {
-    xcnt := 0.U
-  }.elsewhen(state === sActive) {
-    xcnt := xcnt + 1.U
-  }
-
-  when(state === sIdle || ymax === 0.U) {
-    ycnt := 0.U
-  }.elsewhen(state === sActive && xcnt === xmax) {
-    ycnt := ycnt + 1.U
-  }
-
-  io.done := state === sActive & ycnt === ymax & xcnt === xmax
-}
-
-/** TensorDataCtrl. Data controller for TensorLoad. */
-class TensorDataCtrl(tensorType: String = "none",
-    sizeFactor: Int = 1, strideFactor: Int = 1)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val xinit = Input(Bool())
-    val xupdate = Input(Bool())
-    val yupdate = Input(Bool())
-    val stride = Output(Bool())
-    val split = Output(Bool())
-    val commit = Output(Bool())
-    val addr = Output(UInt(mp.addrBits.W))
-    val len = Output(UInt(mp.lenBits.W))
-  })
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-
-  val caddr = Reg(UInt(mp.addrBits.W))
-  val baddr = Reg(UInt(mp.addrBits.W))
-  val len = Reg(UInt(mp.lenBits.W))
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  val elemBytes =
-    if (tensorType == "inp") {
-      (p(CoreKey).batch * p(CoreKey).blockIn * p(CoreKey).inpBits) / 8
-    } else if (tensorType == "wgt") {
-      (p(CoreKey).blockOut * p(CoreKey).blockIn * p(CoreKey).wgtBits) / 8
-    } else {
-      (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).accBits) / 8
-    }
-
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-  val xcnt = Reg(UInt(mp.lenBits.W))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize << log2Ceil(sizeFactor)) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val ycnt = Reg(chiselTypeOf(dec.ysize))
-
-  val xfer_bytes = Reg(UInt(mp.addrBits.W))
-  val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3)
-  val xstride_bytes = dec.xstride << log2Ceil(elemBytes)
-
-  val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes)))
-  val xfer_split_addr = caddr + xfer_bytes
-  val xfer_stride_addr = baddr + xstride_bytes
-
-  val xfer_init_bytes   = xmax_bytes - xfer_init_addr % xmax_bytes
-  val xfer_init_pulses  = xfer_init_bytes >> pulse_bytes_bits
-  val xfer_split_bytes  = xmax_bytes - xfer_split_addr % xmax_bytes
-  val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits
-  val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes
-  val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits
-
-  val stride = xcnt === len &
-    xrem === 0.U &
-    ycnt =/= dec.ysize - 1.U
-
-  val split = xcnt === len & xrem =/= 0.U
-
-  when(io.start) {
-    xfer_bytes := xfer_init_bytes
-    when(xsize < xfer_init_pulses) {
-      len := xsize
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_init_pulses - 1.U
-      xrem := xsize - xfer_init_pulses
-    }
-  }.elsewhen(io.xupdate && stride) {
-    xfer_bytes := xfer_stride_bytes
-    when(xsize < xfer_stride_pulses) {
-      len := xsize
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_stride_pulses - 1.U
-      xrem := xsize - xfer_stride_pulses
-    }
-  }.elsewhen(io.xupdate && split) {
-    xfer_bytes := xfer_split_bytes
-    when(xrem < xfer_split_pulses) {
-      len := xrem
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_split_pulses - 1.U
-      xrem := xrem - xfer_split_pulses
-    }
-  }
-
-  when(io.xinit) {
-    xcnt := 0.U
-  }.elsewhen(io.xupdate) {
-    xcnt := xcnt + 1.U
-  }
-
-  when(io.start) {
-    ycnt := 0.U
-  }.elsewhen(io.yupdate && stride) {
-    ycnt := ycnt + 1.U
-  }
-
-  when(io.start) {
-    caddr := xfer_init_addr
-    baddr := xfer_init_addr
-  }.elsewhen(io.yupdate) {
-    when(split) {
-      caddr := xfer_split_addr
-    }.elsewhen(stride) {
-      caddr := xfer_stride_addr
-      baddr := xfer_stride_addr
-    }
-  }
-
-  io.stride := stride
-  io.split := split
-  io.commit := xcnt === len
-  io.addr := caddr
-  io.len := len
-  io.done := xcnt === len &
-    xrem === 0.U &
-    ycnt === dec.ysize - 1.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala
deleted file mode 100644
index 673d390901de..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta
-
-/** This trick makes ISAConstants globally available */
-package object core extends vta.core.ISAConstants
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
deleted file mode 100644
index a42891661851..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Host DPI parameters */
-trait VTAHostDPIParams {
-  val dpiAddrBits = 8
-  val dpiDataBits = 32
-}
-
-/** Host master interface.
- *
- * This interface is tipically used by the Host
- */
-class VTAHostDPIMaster extends Bundle with VTAHostDPIParams {
-  val req = new Bundle {
-    val valid = Output(Bool())
-    val opcode = Output(Bool())
-    val addr = Output(UInt(dpiAddrBits.W))
-    val value = Output(UInt(dpiDataBits.W))
-    val deq = Input(Bool())
-  }
-  val resp = Flipped(ValidIO(UInt(dpiDataBits.W)))
-}
-
-/** Host client interface.
- *
- * This interface is tipically used by the Accelerator
- */
-class VTAHostDPIClient extends Bundle with VTAHostDPIParams {
-  val req = new Bundle {
-    val valid = Input(Bool())
-    val opcode = Input(Bool())
-    val addr = Input(UInt(dpiAddrBits.W))
-    val value = Input(UInt(dpiDataBits.W))
-    val deq = Output(Bool())
-  }
-  val resp = ValidIO(UInt(dpiDataBits.W))
-}
-
-/** Host DPI module.
- *
- * Wrapper for Host Verilog DPI module.
- */
-class VTAHostDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi = new VTAHostDPIMaster
-  })
-  setResource("/verilog/VTAHostDPI.v")
-}
-
-/** Host DPI to AXI Converter.
- *
- * Convert Host DPI to AXI for VTAShell
- */
-class VTAHostDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val dpi = new VTAHostDPIClient
-    val axi = new AXILiteMaster(p(ShellKey).hostParams)
-  })
-  val addr = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.addr)))
-  val data = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.value)))
-  val sIdle :: sReadAddress :: sReadData :: sWriteAddress :: sWriteData :: sWriteResponse :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.dpi.req.valid) {
-        when(io.dpi.req.opcode) {
-          state := sWriteAddress
-        }.otherwise {
-          state := sReadAddress
-        }
-      }
-    }
-    is(sReadAddress) {
-      when(io.axi.ar.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.axi.r.valid) {
-        state := sIdle
-      }
-    }
-    is(sWriteAddress) {
-      when(io.axi.aw.ready) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.axi.w.ready) {
-        state := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.axi.b.valid) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle && io.dpi.req.valid) {
-    addr := io.dpi.req.addr
-    data := io.dpi.req.value
-  }
-
-  io.axi.aw.valid := state === sWriteAddress
-  io.axi.aw.bits.addr := addr
-  io.axi.w.valid := state === sWriteData
-  io.axi.w.bits.data := data
-  io.axi.w.bits.strb := "h_f".U
-  io.axi.b.ready := state === sWriteResponse
-
-  io.axi.ar.valid := state === sReadAddress
-  io.axi.ar.bits.addr := addr
-  io.axi.r.ready := state === sReadData
-
-  io.dpi.req.deq := (state === sReadAddress & io.axi.ar.ready) | (state === sWriteAddress & io.axi.aw.ready)
-  io.dpi.resp.valid := io.axi.r.valid
-  io.dpi.resp.bits := io.axi.r.bits.data
-
-  if (debug) {
-    when(state === sWriteAddress && io.axi.aw.ready) {
-      printf("[VTAHostDPIToAXI] [AW] addr:%x\n", addr)
-    }
-    when(state === sReadAddress && io.axi.ar.ready) {
-      printf("[VTAHostDPIToAXI] [AR] addr:%x\n", addr)
-    }
-    when(io.axi.r.fire()) {
-      printf("[VTAHostDPIToAXI] [R] value:%x\n", io.axi.r.bits.data)
-    }
-    when(io.axi.w.fire()) {
-      printf("[VTAHostDPIToAXI] [W] value:%x\n", io.axi.w.bits.data)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
deleted file mode 100644
index bffbc1c651cf..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Memory DPI parameters */
-trait VTAMemDPIParams {
-  val dpiLenBits = 8
-  val dpiAddrBits = 64
-  val dpiDataBits = 64
-}
-
-/** Memory master interface.
- *
- * This interface is tipically used by the Accelerator
- */
-class VTAMemDPIMaster extends Bundle with VTAMemDPIParams {
-  val req = new Bundle {
-    val valid = Output(Bool())
-    val opcode = Output(Bool())
-    val len = Output(UInt(dpiLenBits.W))
-    val addr = Output(UInt(dpiAddrBits.W))
-  }
-  val wr = ValidIO(UInt(dpiDataBits.W))
-  val rd = Flipped(Decoupled(UInt(dpiDataBits.W)))
-}
-
-/** Memory client interface.
- *
- * This interface is tipically used by the Host
- */
-class VTAMemDPIClient extends Bundle with VTAMemDPIParams {
-  val req = new Bundle {
-    val valid = Input(Bool())
-    val opcode = Input(Bool())
-    val len = Input(UInt(dpiLenBits.W))
-    val addr = Input(UInt(dpiAddrBits.W))
-  }
-  val wr = Flipped(ValidIO(UInt(dpiDataBits.W)))
-  val rd = Decoupled(UInt(dpiDataBits.W))
-}
-
-/** Memory DPI module.
- *
- * Wrapper for Memory Verilog DPI module.
- */
-class VTAMemDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi = new VTAMemDPIClient
-  })
-  setResource("/verilog/VTAMemDPI.v")
-}
-
-class VTAMemDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val dpi = new VTAMemDPIMaster
-    val axi = new AXIClient(p(ShellKey).memParams)
-  })
-  val opcode = RegInit(false.B)
-  val len = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.len)))
-  val addr = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.addr)))
-  val sIdle :: sReadAddress :: sReadData :: sWriteAddress :: sWriteData :: sWriteResponse :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.axi.ar.valid) {
-        state := sReadAddress
-      }.elsewhen(io.axi.aw.valid) {
-        state := sWriteAddress
-      }
-    }
-    is(sReadAddress) {
-      when(io.axi.ar.valid) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.axi.r.ready && io.dpi.rd.valid && len === 0.U) {
-        state := sIdle
-      }
-    }
-    is(sWriteAddress) {
-      when(io.axi.aw.valid) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.axi.w.valid && io.axi.w.bits.last) {
-        state := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.axi.b.ready) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    when(io.axi.ar.valid) {
-      opcode := false.B
-      len := io.axi.ar.bits.len
-      addr := io.axi.ar.bits.addr
-    }.elsewhen(io.axi.aw.valid) {
-      opcode := true.B
-      len := io.axi.aw.bits.len
-      addr := io.axi.aw.bits.addr
-    }
-  }.elsewhen(state === sReadData) {
-    when(io.axi.r.ready && io.dpi.rd.valid && len =/= 0.U) {
-      len := len - 1.U
-    }
-  }
-
-  io.dpi.req.valid := (state === sReadAddress & io.axi.ar.valid) | (state === sWriteAddress & io.axi.aw.valid)
-  io.dpi.req.opcode := opcode
-  io.dpi.req.len := len
-  io.dpi.req.addr := addr
-
-  io.axi.ar.ready := state === sReadAddress
-  io.axi.aw.ready := state === sWriteAddress
-
-  io.axi.r.valid := state === sReadData & io.dpi.rd.valid
-  io.axi.r.bits.data := io.dpi.rd.bits
-  io.axi.r.bits.last := len === 0.U
-  io.axi.r.bits.resp := 0.U
-  io.axi.r.bits.user := 0.U
-  io.axi.r.bits.id := 0.U
-  io.dpi.rd.ready := state === sReadData & io.axi.r.ready
-
-  io.dpi.wr.valid := state === sWriteData & io.axi.w.valid
-  io.dpi.wr.bits := io.axi.w.bits.data
-  io.axi.w.ready := state === sWriteData
-
-  io.axi.b.valid := state === sWriteResponse
-  io.axi.b.bits.resp := 0.U
-  io.axi.b.bits.user := 0.U
-  io.axi.b.bits.id := 0.U
-
-  if (debug) {
-    when(state === sReadAddress && io.axi.ar.valid) {
-      printf("[VTAMemDPIToAXI] [AR] addr:%x len:%x\n", addr, len)
-    }
-    when(state === sWriteAddress && io.axi.aw.valid) {
-      printf("[VTAMemDPIToAXI] [AW] addr:%x len:%x\n", addr, len)
-    }
-    when(io.axi.r.fire()) {
-      printf("[VTAMemDPIToAXI] [R] last:%x data:%x\n",
-        io.axi.r.bits.last,
-        io.axi.r.bits.data)
-    }
-    when(io.axi.w.fire()) {
-      printf("[VTAMemDPIToAXI] [W] last:%x data:%x\n",
-        io.axi.w.bits.last,
-        io.axi.w.bits.data)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
deleted file mode 100644
index 2f2532804eec..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Sim DPI module.
- *
- * Wrapper for Sim Verilog DPI module.
- */
-class VTASimDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi_wait = Output(Bool())
-  })
-  setResource("/verilog/VTASimDPI.v")
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala
deleted file mode 100644
index 515159075602..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.interface.axi
-
-import chisel3._
-import chisel3.util._
-import vta.util.genericbundle._
-
-case class AXIParams(
-    coherent: Boolean = false,
-    idBits: Int = 1,
-    addrBits: Int = 32,
-    dataBits: Int = 64,
-    lenBits: Int = 8,
-    userBits: Int = 1
-) {
-  require(addrBits > 0)
-  require(dataBits >= 8 && dataBits % 2 == 0)
-
-  val strbBits = dataBits / 8
-  val sizeBits = 3
-  val burstBits = 2
-  val lockBits = 2
-  val cacheBits = 4
-  val protBits = 3
-  val qosBits = 4
-  val regionBits = 4
-  val respBits = 2
-  val sizeConst = log2Ceil(dataBits / 8)
-  val idConst = 0
-  val userConst = if (coherent) 1 else 0
-  val burstConst = 1
-  val lockConst = 0
-  val cacheConst = if (coherent) 15 else 3
-  val protConst = if (coherent) 4 else 0
-  val qosConst = 0
-  val regionConst = 0
-}
-
-abstract class AXIBase(params: AXIParams)
-  extends GenericParameterizedBundle(params)
-
-// AXILite
-
-class AXILiteAddress(params: AXIParams) extends AXIBase(params) {
-  val addr = UInt(params.addrBits.W)
-}
-
-class AXILiteWriteData(params: AXIParams) extends AXIBase(params) {
-  val data = UInt(params.dataBits.W)
-  val strb = UInt(params.strbBits.W)
-}
-
-class AXILiteWriteResponse(params: AXIParams) extends AXIBase(params) {
-  val resp = UInt(params.respBits.W)
-}
-
-class AXILiteReadData(params: AXIParams) extends AXIBase(params) {
-  val data = UInt(params.dataBits.W)
-  val resp = UInt(params.respBits.W)
-}
-
-class AXILiteMaster(params: AXIParams) extends AXIBase(params) {
-  val aw = Decoupled(new AXILiteAddress(params))
-  val w = Decoupled(new AXILiteWriteData(params))
-  val b = Flipped(Decoupled(new AXILiteWriteResponse(params)))
-  val ar = Decoupled(new AXILiteAddress(params))
-  val r = Flipped(Decoupled(new AXILiteReadData(params)))
-
-  def tieoff() {
-    aw.valid := false.B
-    aw.bits.addr := 0.U
-    w.valid := false.B
-    w.bits.data := 0.U
-    w.bits.strb := 0.U
-    b.ready := false.B
-    ar.valid := false.B
-    ar.bits.addr := 0.U
-    r.ready := false.B
-  }
-}
-
-class AXILiteClient(params: AXIParams) extends AXIBase(params) {
-  val aw = Flipped(Decoupled(new AXILiteAddress(params)))
-  val w = Flipped(Decoupled(new AXILiteWriteData(params)))
-  val b = Decoupled(new AXILiteWriteResponse(params))
-  val ar = Flipped(Decoupled(new AXILiteAddress(params)))
-  val r = Decoupled(new AXILiteReadData(params))
-
-  def tieoff() {
-    aw.ready := false.B
-    w.ready := false.B
-    b.valid := false.B
-    b.bits.resp := 0.U
-    ar.ready := false.B
-    r.valid := false.B
-    r.bits.resp := 0.U
-    r.bits.data := 0.U
-  }
-}
-
-// AXI extends AXILite
-
-class AXIAddress(params: AXIParams) extends AXILiteAddress(params) {
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-  val len = UInt(params.lenBits.W)
-  val size = UInt(params.sizeBits.W)
-  val burst = UInt(params.burstBits.W)
-  val lock = UInt(params.lockBits.W)
-  val cache = UInt(params.cacheBits.W)
-  val prot = UInt(params.protBits.W)
-  val qos = UInt(params.qosBits.W)
-  val region = UInt(params.regionBits.W)
-}
-
-class AXIWriteData(params: AXIParams) extends AXILiteWriteData(params) {
-  val last = Bool()
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-class AXIWriteResponse(params: AXIParams) extends AXILiteWriteResponse(params) {
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-class AXIReadData(params: AXIParams) extends AXILiteReadData(params) {
-  val last = Bool()
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-class AXIMaster(params: AXIParams) extends AXIBase(params) {
-  val aw = Decoupled(new AXIAddress(params))
-  val w = Decoupled(new AXIWriteData(params))
-  val b = Flipped(Decoupled(new AXIWriteResponse(params)))
-  val ar = Decoupled(new AXIAddress(params))
-  val r = Flipped(Decoupled(new AXIReadData(params)))
-
-  def tieoff() {
-    aw.valid := false.B
-    aw.bits.addr := 0.U
-    aw.bits.id := 0.U
-    aw.bits.user := 0.U
-    aw.bits.len := 0.U
-    aw.bits.size := 0.U
-    aw.bits.burst := 0.U
-    aw.bits.lock := 0.U
-    aw.bits.cache := 0.U
-    aw.bits.prot := 0.U
-    aw.bits.qos := 0.U
-    aw.bits.region := 0.U
-    w.valid := false.B
-    w.bits.data := 0.U
-    w.bits.strb := 0.U
-    w.bits.last := false.B
-    w.bits.id := 0.U
-    w.bits.user := 0.U
-    b.ready := false.B
-    ar.valid := false.B
-    ar.bits.addr := 0.U
-    ar.bits.id := 0.U
-    ar.bits.user := 0.U
-    ar.bits.len := 0.U
-    ar.bits.size := 0.U
-    ar.bits.burst := 0.U
-    ar.bits.lock := 0.U
-    ar.bits.cache := 0.U
-    ar.bits.prot := 0.U
-    ar.bits.qos := 0.U
-    ar.bits.region := 0.U
-    r.ready := false.B
-  }
-
-  def setConst() {
-    aw.bits.user := params.userConst.U
-    aw.bits.burst := params.burstConst.U
-    aw.bits.lock := params.lockConst.U
-    aw.bits.cache := params.cacheConst.U
-    aw.bits.prot := params.protConst.U
-    aw.bits.qos := params.qosConst.U
-    aw.bits.region := params.regionConst.U
-    aw.bits.size := params.sizeConst.U
-    aw.bits.id := params.idConst.U
-    w.bits.id := params.idConst.U
-    w.bits.user := params.userConst.U
-    w.bits.strb := Fill(params.strbBits, true.B)
-    ar.bits.user := params.userConst.U
-    ar.bits.burst := params.burstConst.U
-    ar.bits.lock := params.lockConst.U
-    ar.bits.cache := params.cacheConst.U
-    ar.bits.prot := params.protConst.U
-    ar.bits.qos := params.qosConst.U
-    ar.bits.region := params.regionConst.U
-    ar.bits.size := params.sizeConst.U
-    ar.bits.id := params.idConst.U
-  }
-}
-
-class AXIClient(params: AXIParams) extends AXIBase(params) {
-  val aw = Flipped(Decoupled(new AXIAddress(params)))
-  val w = Flipped(Decoupled(new AXIWriteData(params)))
-  val b = Decoupled(new AXIWriteResponse(params))
-  val ar = Flipped(Decoupled(new AXIAddress(params)))
-  val r = Decoupled(new AXIReadData(params))
-
-  def tieoff() {
-    aw.ready := false.B
-    w.ready := false.B
-    b.valid := false.B
-    b.bits.resp := 0.U
-    b.bits.user := 0.U
-    b.bits.id := 0.U
-    ar.ready := false.B
-    r.valid := false.B
-    r.bits.resp := 0.U
-    r.bits.data := 0.U
-    r.bits.user := 0.U
-    r.bits.last := false.B
-    r.bits.id := 0.U
-  }
-}
-
-// XilinxAXILiteClient and XilinxAXIMaster bundles are needed
-// for wrapper purposes, because the package RTL tool in Xilinx Vivado
-// only allows certain name formats
-
-class XilinxAXILiteClient(params: AXIParams) extends AXIBase(params) {
-  val AWVALID = Input(Bool())
-  val AWREADY = Output(Bool())
-  val AWADDR = Input(UInt(params.addrBits.W))
-  val WVALID = Input(Bool())
-  val WREADY = Output(Bool())
-  val WDATA = Input(UInt(params.dataBits.W))
-  val WSTRB = Input(UInt(params.strbBits.W))
-  val BVALID = Output(Bool())
-  val BREADY = Input(Bool())
-  val BRESP = Output(UInt(params.respBits.W))
-  val ARVALID = Input(Bool())
-  val ARREADY = Output(Bool())
-  val ARADDR = Input(UInt(params.addrBits.W))
-  val RVALID = Output(Bool())
-  val RREADY = Input(Bool())
-  val RDATA = Output(UInt(params.dataBits.W))
-  val RRESP = Output(UInt(params.respBits.W))
-}
-
-class XilinxAXIMaster(params: AXIParams) extends AXIBase(params) {
-  val AWVALID = Output(Bool())
-  val AWREADY = Input(Bool())
-  val AWADDR = Output(UInt(params.addrBits.W))
-  val AWID = Output(UInt(params.idBits.W))
-  val AWUSER = Output(UInt(params.userBits.W))
-  val AWLEN = Output(UInt(params.lenBits.W))
-  val AWSIZE = Output(UInt(params.sizeBits.W))
-  val AWBURST = Output(UInt(params.burstBits.W))
-  val AWLOCK = Output(UInt(params.lockBits.W))
-  val AWCACHE = Output(UInt(params.cacheBits.W))
-  val AWPROT = Output(UInt(params.protBits.W))
-  val AWQOS = Output(UInt(params.qosBits.W))
-  val AWREGION = Output(UInt(params.regionBits.W))
-  val WVALID = Output(Bool())
-  val WREADY = Input(Bool())
-  val WDATA = Output(UInt(params.dataBits.W))
-  val WSTRB = Output(UInt(params.strbBits.W))
-  val WLAST = Output(Bool())
-  val WID = Output(UInt(params.idBits.W))
-  val WUSER = Output(UInt(params.userBits.W))
-  val BVALID = Input(Bool())
-  val BREADY = Output(Bool())
-  val BRESP = Input(UInt(params.respBits.W))
-  val BID = Input(UInt(params.idBits.W))
-  val BUSER = Input(UInt(params.userBits.W))
-  val ARVALID = Output(Bool())
-  val ARREADY = Input(Bool())
-  val ARADDR = Output(UInt(params.addrBits.W))
-  val ARID = Output(UInt(params.idBits.W))
-  val ARUSER = Output(UInt(params.userBits.W))
-  val ARLEN = Output(UInt(params.lenBits.W))
-  val ARSIZE = Output(UInt(params.sizeBits.W))
-  val ARBURST = Output(UInt(params.burstBits.W))
-  val ARLOCK = Output(UInt(params.lockBits.W))
-  val ARCACHE = Output(UInt(params.cacheBits.W))
-  val ARPROT = Output(UInt(params.protBits.W))
-  val ARQOS = Output(UInt(params.qosBits.W))
-  val ARREGION = Output(UInt(params.regionBits.W))
-  val RVALID = Input(Bool())
-  val RREADY = Output(Bool())
-  val RDATA = Input(UInt(params.dataBits.W))
-  val RRESP = Input(UInt(params.respBits.W))
-  val RLAST = Input(Bool())
-  val RID = Input(UInt(params.idBits.W))
-  val RUSER = Input(UInt(params.userBits.W))
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala
deleted file mode 100644
index b0c54029b1a4..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-
-/** PynqConfig. Shell configuration for Pynq */
-class PynqConfig extends Config((site, here, up) => {
-  case ShellKey =>
-    ShellParams(
-      hostParams = AXIParams(coherent = false,
-        addrBits = 16,
-        dataBits = 32,
-        lenBits = 8,
-        userBits = 1),
-      memParams = AXIParams(coherent = true,
-        addrBits = 32,
-        dataBits = 64,
-        lenBits = 8,
-        userBits = 1),
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams()
-    )
-})
-
-/** F1Config. Shell configuration for F1 */
-class F1Config extends Config((site, here, up) => {
-  case ShellKey =>
-    ShellParams(
-      hostParams = AXIParams(coherent = false,
-        addrBits = 16,
-        dataBits = 32,
-        lenBits = 8,
-        userBits = 1),
-      memParams = AXIParams(coherent = false,
-        addrBits = 64,
-        dataBits = 64,
-        lenBits = 8,
-        userBits = 1),
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams()
-    )
-})
-
-/** De10Config. Shell configuration for De10 */
-class De10Config extends Config((site, here, up) => {
-  case ShellKey =>
-    ShellParams(
-      hostParams =
-        AXIParams(addrBits = 16, dataBits = 32, idBits = 13, lenBits = 4),
-      memParams = AXIParams(
-        addrBits = 32,
-        dataBits = 64,
-        userBits = 5,
-        lenBits = 4,  // limit to 16 beats, instead of 256 beats in AXI4
-        coherent = true),
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams()
-    )
-})
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala
deleted file mode 100644
index e1b6995decd0..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import vta.util.config._
-import vta.interface.axi._
-import vta.core._
-
-/** IntelShell.
- *
- * The IntelShell is based on a VME, VCR and core. This creates a complete VTA
- * system that can be used for simulation or real hardware.
- */
-class IntelShell(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val host = new AXIClient(p(ShellKey).hostParams)
-    val mem = new AXIMaster(p(ShellKey).memParams)
-  })
-
-  val vcr = Module(new VCR)
-  val vme = Module(new VME)
-  val core = Module(new Core)
-
-  core.io.vcr <> vcr.io.vcr
-  vme.io.vme <> core.io.vme
-
-  // vcr.io.host <> io.host
-  io.host.aw.ready := vcr.io.host.aw.ready
-  vcr.io.host.aw.valid := io.host.aw.valid
-  vcr.io.host.aw.bits.addr := io.host.aw.bits.addr
-  io.host.w.ready := vcr.io.host.w.ready
-  vcr.io.host.w.valid := io.host.w.valid
-  vcr.io.host.w.bits.data := io.host.w.bits.data
-  vcr.io.host.w.bits.strb := io.host.w.bits.strb
-  vcr.io.host.b.ready := io.host.b.ready
-  io.host.b.valid := vcr.io.host.b.valid
-  io.host.b.bits.resp := vcr.io.host.b.bits.resp
-  io.host.b.bits.id := io.host.w.bits.id
-
-  io.host.ar.ready := vcr.io.host.ar.ready
-  vcr.io.host.ar.valid := io.host.ar.valid
-  vcr.io.host.ar.bits.addr := io.host.ar.bits.addr
-  vcr.io.host.r.ready := io.host.r.ready
-  io.host.r.valid := vcr.io.host.r.valid
-  io.host.r.bits.data := vcr.io.host.r.bits.data
-  io.host.r.bits.resp := vcr.io.host.r.bits.resp
-  io.host.r.bits.id := io.host.ar.bits.id
-
-  io.host.b.bits.user <> DontCare
-  io.host.r.bits.user <> DontCare
-  io.host.r.bits.last := 1.U
-
-  io.mem <> vme.io.mem
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala
deleted file mode 100644
index 0909d1bfe47e..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-import vta.dpi._
-
-/** VTAHost.
- *
- * This module translate the DPI protocol into AXI. This is a simulation only
- * module and used to test host-to-VTA communication. This module should be updated
- * for testing hosts using a different bus protocol, other than AXI.
- */
-class VTAHost(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val axi = new AXILiteMaster(p(ShellKey).hostParams)
-  })
-  val host_dpi = Module(new VTAHostDPI)
-  val host_axi = Module(new VTAHostDPIToAXI)
-  host_dpi.io.reset := reset
-  host_dpi.io.clock := clock
-  host_axi.io.dpi <> host_dpi.io.dpi
-  io.axi <> host_axi.io.axi
-}
-
-/** VTAMem.
- *
- * This module translate the DPI protocol into AXI. This is a simulation only
- * module and used to test VTA-to-memory communication. This module should be updated
- * for testing memories using a different bus protocol, other than AXI.
- */
-class VTAMem(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val axi = new AXIClient(p(ShellKey).memParams)
-  })
-  val mem_dpi = Module(new VTAMemDPI)
-  val mem_axi = Module(new VTAMemDPIToAXI)
-  mem_dpi.io.reset := reset
-  mem_dpi.io.clock := clock
-  mem_dpi.io.dpi <> mem_axi.io.dpi
-  mem_axi.io.axi <> io.axi
-}
-
-/** VTASim.
- *
- * This module is used to handle hardware simulation thread, such as halting
- * or terminating the simulation thread. The sim_wait port is used to halt
- * the simulation thread when it is asserted and resume it when it is
- * de-asserted.
- */
-class VTASim(implicit p: Parameters) extends MultiIOModule {
-  val sim_wait = IO(Output(Bool()))
-  val sim = Module(new VTASimDPI)
-  sim.io.reset := reset
-  sim.io.clock := clock
-  sim_wait := sim.io.dpi_wait
-}
-
-/** SimShell.
- *
- * The simulation shell instantiate the sim, host and memory DPI modules that
- * are connected to the VTAShell. An extra clock, sim_clock, is used to eval
- * the VTASim DPI function when the main simulation clock is on halt state.
- */
-class SimShell(implicit p: Parameters) extends MultiIOModule {
-  val mem = IO(new AXIClient(p(ShellKey).memParams))
-  val host = IO(new AXILiteMaster(p(ShellKey).hostParams))
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASim)
-  val mod_host = Module(new VTAHost)
-  val mod_mem = Module(new VTAMem)
-  mem <> mod_mem.io.axi
-  host <> mod_host.io.axi
-  mod_sim.reset := reset
-  mod_sim.clock := sim_clock
-  sim_wait := mod_sim.sim_wait
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala
deleted file mode 100644
index 9a80cd7799a3..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.util.genericbundle._
-import vta.interface.axi._
-
-/** VCR parameters.
- *
- * These parameters are used on VCR interfaces and modules.
- */
-case class VCRParams() {
-  val nCtrl = 1
-  val nECnt = 1
-  val nVals = 1
-  val nPtrs = 6
-  val nUCnt = 1
-  val regBits = 32
-}
-
-/** VCRBase. Parametrize base class. */
-abstract class VCRBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
-
-/** VCRMaster.
- *
- * This is the master interface used by VCR in the VTAShell to control
- * the Core unit.
- */
-class VCRMaster(implicit p: Parameters) extends VCRBase {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val launch = Output(Bool())
-  val finish = Input(Bool())
-  val ecnt = Vec(vp.nECnt, Flipped(ValidIO(UInt(vp.regBits.W))))
-  val vals = Output(Vec(vp.nVals, UInt(vp.regBits.W)))
-  val ptrs = Output(Vec(vp.nPtrs, UInt(mp.addrBits.W)))
-  val ucnt = Vec(vp.nUCnt, Flipped(ValidIO(UInt(vp.regBits.W))))
-}
-
-/** VCRClient.
- *
- * This is the client interface used by the Core module to communicate
- * to the VCR in the VTAShell.
- */
-class VCRClient(implicit p: Parameters) extends VCRBase {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val launch = Input(Bool())
-  val finish = Output(Bool())
-  val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
-  val vals = Input(Vec(vp.nVals, UInt(vp.regBits.W)))
-  val ptrs = Input(Vec(vp.nPtrs, UInt(mp.addrBits.W)))
-  val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
-}
-
-/** VTA Control Registers (VCR).
- *
- * This unit provides control registers (32 and 64 bits) to be used by a control'
- * unit, typically a host processor. These registers are read-only by the core
- * at the moment but this will likely change once we add support to general purpose
- * registers that could be used as event counters by the Core unit.
- */
-class VCR(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val host = new AXILiteClient(p(ShellKey).hostParams)
-    val vcr = new VCRMaster
-  })
-
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val hp = p(ShellKey).hostParams
-
-  // Write control (AW, W, B)
-  val waddr = RegInit("h_ffff".U(hp.addrBits.W)) // init with invalid address
-  val wdata = io.host.w.bits.data
-  val sWriteAddress :: sWriteData :: sWriteResponse :: Nil = Enum(3)
-  val wstate = RegInit(sWriteAddress)
-
-  // read control (AR, R)
-  val sReadAddress :: sReadData :: Nil = Enum(2)
-  val rstate = RegInit(sReadAddress)
-  val rdata = RegInit(0.U(vp.regBits.W))
-
-  // registers
-  val nPtrs = if (mp.addrBits == 32) vp.nPtrs else 2 * vp.nPtrs
-  val nTotal = vp.nCtrl + vp.nECnt + vp.nVals + nPtrs + vp.nUCnt
-
-  val reg = Seq.fill(nTotal)(RegInit(0.U(vp.regBits.W)))
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg) map { case (a, r) => a.U -> r }
-  val eo = vp.nCtrl
-  val vo = eo + vp.nECnt
-  val po = vo + vp.nVals
-  val uo = po + nPtrs
-
-  switch(wstate) {
-    is(sWriteAddress) {
-      when(io.host.aw.valid) {
-        wstate := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.host.w.valid) {
-        wstate := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.host.b.ready) {
-        wstate := sWriteAddress
-      }
-    }
-  }
-
-  when(io.host.aw.fire()) { waddr := io.host.aw.bits.addr }
-
-  io.host.aw.ready := wstate === sWriteAddress
-  io.host.w.ready := wstate === sWriteData
-  io.host.b.valid := wstate === sWriteResponse
-  io.host.b.bits.resp := 0.U
-
-  switch(rstate) {
-    is(sReadAddress) {
-      when(io.host.ar.valid) {
-        rstate := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.host.r.ready) {
-        rstate := sReadAddress
-      }
-    }
-  }
-
-  io.host.ar.ready := rstate === sReadAddress
-  io.host.r.valid := rstate === sReadData
-  io.host.r.bits.data := rdata
-  io.host.r.bits.resp := 0.U
-
-  when(io.vcr.finish) {
-    reg(0) := "b_10".U
-  }.elsewhen(io.host.w.fire() && addr(0).U === waddr) {
-    reg(0) := wdata
-  }
-
-  for (i <- 0 until vp.nECnt) {
-    when(io.vcr.ecnt(i).valid) {
-      reg(eo + i) := io.vcr.ecnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(eo + i).U === waddr) {
-      reg(eo + i) := wdata
-    }
-  }
-
-  for (i <- 0 until (vp.nVals + nPtrs)) {
-    when(io.host.w.fire() && addr(vo + i).U === waddr) {
-      reg(vo + i) := wdata
-    }
-  }
-
-  when(io.host.ar.fire()) {
-    rdata := MuxLookup(io.host.ar.bits.addr, 0.U, reg_map)
-  }
-
-  io.vcr.launch := reg(0)(0)
-
-  for (i <- 0 until vp.nVals) {
-    io.vcr.vals(i) := reg(vo + i)
-  }
-
-  if (mp.addrBits == 32) { // 32-bit pointers
-    for (i <- 0 until nPtrs) {
-      io.vcr.ptrs(i) := reg(po + i)
-    }
-  } else { // 64-bits pointers
-    for (i <- 0 until (nPtrs / 2)) {
-      io.vcr.ptrs(i) := Cat(reg(po + 2 * i + 1), reg(po + 2 * i))
-    }
-  }
-
-  for (i <- 0 until vp.nUCnt) {
-    when(io.vcr.ucnt(i).valid) {
-      reg(uo + i) := io.vcr.ucnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(uo + i).U === waddr) {
-      reg(uo + i) := wdata
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala
deleted file mode 100644
index 41b24d1ba7aa..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.util.genericbundle._
-import vta.interface.axi._
-
-/** VME parameters.
- *
- * These parameters are used on VME interfaces and modules.
- */
-case class VMEParams() {
-  val nReadClients: Int = 5
-  val nWriteClients: Int = 1
-  require(nReadClients > 0,
-    s"\n\n[VTA] [VMEParams] nReadClients must be larger than 0\n\n")
-  require(
-    nWriteClients == 1,
-    s"\n\n[VTA] [VMEParams] nWriteClients must be 1, only one-write-client support atm\n\n")
-}
-
-/** VMEBase. Parametrize base class. */
-abstract class VMEBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
-
-/** VMECmd.
- *
- * This interface is used for creating write and read requests to memory.
- */
-class VMECmd(implicit p: Parameters) extends VMEBase {
-  val addrBits = p(ShellKey).memParams.addrBits
-  val lenBits = p(ShellKey).memParams.lenBits
-  val addr = UInt(addrBits.W)
-  val len = UInt(lenBits.W)
-}
-
-/** VMEReadMaster.
- *
- * This interface is used by modules inside the core to generate read requests
- * and receive responses from VME.
- */
-class VMEReadMaster(implicit p: Parameters) extends Bundle {
-  val dataBits = p(ShellKey).memParams.dataBits
-  val cmd = Decoupled(new VMECmd)
-  val data = Flipped(Decoupled(UInt(dataBits.W)))
-  override def cloneType =
-    new VMEReadMaster().asInstanceOf[this.type]
-}
-
-/** VMEReadClient.
- *
- * This interface is used by the VME to receive read requests and generate
- * responses to modules inside the core.
- */
-class VMEReadClient(implicit p: Parameters) extends Bundle {
-  val dataBits = p(ShellKey).memParams.dataBits
-  val cmd = Flipped(Decoupled(new VMECmd))
-  val data = Decoupled(UInt(dataBits.W))
-  override def cloneType =
-    new VMEReadClient().asInstanceOf[this.type]
-}
-
-/** VMEWriteMaster.
- *
- * This interface is used by modules inside the core to generate write requests
- * to the VME.
- */
-class VMEWriteMaster(implicit p: Parameters) extends Bundle {
-  val dataBits = p(ShellKey).memParams.dataBits
-  val cmd = Decoupled(new VMECmd)
-  val data = Decoupled(UInt(dataBits.W))
-  val ack = Input(Bool())
-  override def cloneType =
-    new VMEWriteMaster().asInstanceOf[this.type]
-}
-
-/** VMEWriteClient.
- *
- * This interface is used by the VME to handle write requests from modules inside
- * the core.
- */
-class VMEWriteClient(implicit p: Parameters) extends Bundle {
-  val dataBits = p(ShellKey).memParams.dataBits
-  val cmd = Flipped(Decoupled(new VMECmd))
-  val data = Flipped(Decoupled(UInt(dataBits.W)))
-  val ack = Output(Bool())
-  override def cloneType =
-    new VMEWriteClient().asInstanceOf[this.type]
-}
-
-/** VMEMaster.
- *
- * Pack nRd number of VMEReadMaster interfaces and nWr number of VMEWriteMaster
- * interfaces.
- */
-class VMEMaster(implicit p: Parameters) extends Bundle {
-  val nRd = p(ShellKey).vmeParams.nReadClients
-  val nWr = p(ShellKey).vmeParams.nWriteClients
-  val rd = Vec(nRd, new VMEReadMaster)
-  val wr = Vec(nWr, new VMEWriteMaster)
-}
-
-/** VMEClient.
- *
- * Pack nRd number of VMEReadClient interfaces and nWr number of VMEWriteClient
- * interfaces.
- */
-class VMEClient(implicit p: Parameters) extends Bundle {
-  val nRd = p(ShellKey).vmeParams.nReadClients
-  val nWr = p(ShellKey).vmeParams.nWriteClients
-  val rd = Vec(nRd, new VMEReadClient)
-  val wr = Vec(nWr, new VMEWriteClient)
-}
-
-/** VTA Memory Engine (VME).
- *
- * This unit multiplexes the memory controller interface for the Core. Currently,
- * it supports single-writer and multiple-reader mode and it is also based on AXI.
- */
-class VME(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val mem = new AXIMaster(p(ShellKey).memParams)
-    val vme = new VMEClient
-  })
-
-  val nReadClients = p(ShellKey).vmeParams.nReadClients
-  val rd_arb = Module(new Arbiter(new VMECmd, nReadClients))
-  val rd_arb_chosen = RegEnable(rd_arb.io.chosen, rd_arb.io.out.fire())
-
-  for (i <- 0 until nReadClients) { rd_arb.io.in(i) <> io.vme.rd(i).cmd }
-
-  val sReadIdle :: sReadAddr :: sReadData :: Nil = Enum(3)
-  val rstate = RegInit(sReadIdle)
-
-  switch(rstate) {
-    is(sReadIdle) {
-      when(rd_arb.io.out.valid) {
-        rstate := sReadAddr
-      }
-    }
-    is(sReadAddr) {
-      when(io.mem.ar.ready) {
-        rstate := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.mem.r.fire() && io.mem.r.bits.last) {
-        rstate := sReadIdle
-      }
-    }
-  }
-
-  val sWriteIdle :: sWriteAddr :: sWriteData :: sWriteResp :: Nil = Enum(4)
-  val wstate = RegInit(sWriteIdle)
-  val addrBits = p(ShellKey).memParams.addrBits
-  val lenBits = p(ShellKey).memParams.lenBits
-  val wr_cnt = RegInit(0.U(lenBits.W))
-
-  when(wstate === sWriteIdle) {
-    wr_cnt := 0.U
-  }.elsewhen(io.mem.w.fire()) {
-    wr_cnt := wr_cnt + 1.U
-  }
-
-  switch(wstate) {
-    is(sWriteIdle) {
-      when(io.vme.wr(0).cmd.valid) {
-        wstate := sWriteAddr
-      }
-    }
-    is(sWriteAddr) {
-      when(io.mem.aw.ready) {
-        wstate := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(
-        io.vme
-          .wr(0)
-          .data
-          .valid && io.mem.w.ready && wr_cnt === io.vme.wr(0).cmd.bits.len) {
-        wstate := sWriteResp
-      }
-    }
-    is(sWriteResp) {
-      when(io.mem.b.valid) {
-        wstate := sWriteIdle
-      }
-    }
-  }
-
-  // registers storing read/write cmds
-
-  val rd_len = RegInit(0.U(lenBits.W))
-  val wr_len = RegInit(0.U(lenBits.W))
-  val rd_addr = RegInit(0.U(addrBits.W))
-  val wr_addr = RegInit(0.U(addrBits.W))
-
-  when(rd_arb.io.out.fire()) {
-    rd_len := rd_arb.io.out.bits.len
-    rd_addr := rd_arb.io.out.bits.addr
-  }
-
-  when(io.vme.wr(0).cmd.fire()) {
-    wr_len := io.vme.wr(0).cmd.bits.len
-    wr_addr := io.vme.wr(0).cmd.bits.addr
-  }
-
-  // rd arb
-  rd_arb.io.out.ready := rstate === sReadIdle
-
-  // vme
-  for (i <- 0 until nReadClients) {
-    io.vme.rd(i).data.valid := rd_arb_chosen === i.asUInt & io.mem.r.valid
-    io.vme.rd(i).data.bits := io.mem.r.bits.data
-  }
-
-  io.vme.wr(0).cmd.ready := wstate === sWriteIdle
-  io.vme.wr(0).ack := io.mem.b.fire()
-  io.vme.wr(0).data.ready := wstate === sWriteData & io.mem.w.ready
-
-  // mem
-  io.mem.aw.valid := wstate === sWriteAddr
-  io.mem.aw.bits.addr := wr_addr
-  io.mem.aw.bits.len := wr_len
-
-  io.mem.w.valid := wstate === sWriteData & io.vme.wr(0).data.valid
-  io.mem.w.bits.data := io.vme.wr(0).data.bits
-  io.mem.w.bits.last := wr_cnt === io.vme.wr(0).cmd.bits.len
-
-  io.mem.b.ready := wstate === sWriteResp
-
-  io.mem.ar.valid := rstate === sReadAddr
-  io.mem.ar.bits.addr := rd_addr
-  io.mem.ar.bits.len := rd_len
-
-  io.mem.r.ready := rstate === sReadData & io.vme.rd(rd_arb_chosen).data.ready
-
-  // AXI constants - statically defined
-  io.mem.setConst()
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VTAShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/VTAShell.scala
deleted file mode 100644
index 650a1c56c352..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VTAShell.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import vta.util.config._
-import vta.interface.axi._
-import vta.core._
-
-/** Shell parameters. */
-case class ShellParams(
-    hostParams: AXIParams,
-    memParams: AXIParams,
-    vcrParams: VCRParams,
-    vmeParams: VMEParams
-)
-
-case object ShellKey extends Field[ShellParams]
-
-/** VTAShell.
- *
- * The VTAShell is based on a VME, VCR and core. This creates a complete VTA
- * system that can be used for simulation or real hardware.
- */
-class VTAShell(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val host = new AXILiteClient(p(ShellKey).hostParams)
-    val mem = new AXIMaster(p(ShellKey).memParams)
-  })
-
-  val vcr = Module(new VCR)
-  val vme = Module(new VME)
-  val core = Module(new Core)
-
-  core.io.vcr <> vcr.io.vcr
-  vme.io.vme <> core.io.vme
-
-  vcr.io.host <> io.host
-  io.mem <> vme.io.mem
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/XilinxShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/XilinxShell.scala
deleted file mode 100644
index 28f95ea36bc4..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/XilinxShell.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.experimental.{withClockAndReset, RawModule}
-import vta.util.config._
-import vta.interface.axi._
-
-/** XilinxShell.
- *
- * This is a wrapper shell mostly used to match Xilinx convention naming,
- * therefore we can pack VTA as an IP for IPI based flows.
- */
-class XilinxShell(implicit p: Parameters) extends RawModule {
-
-  val hp = p(ShellKey).hostParams
-  val mp = p(ShellKey).memParams
-
-  val ap_clk = IO(Input(Clock()))
-  val ap_rst_n = IO(Input(Bool()))
-  val m_axi_gmem = IO(new XilinxAXIMaster(mp))
-  val s_axi_control = IO(new XilinxAXILiteClient(hp))
-
-  val shell = withClockAndReset(clock = ap_clk, reset = ~ap_rst_n) {
-    Module(new VTAShell)
-  }
-
-  // memory
-  m_axi_gmem.AWVALID := shell.io.mem.aw.valid
-  shell.io.mem.aw.ready := m_axi_gmem.AWREADY
-  m_axi_gmem.AWADDR := shell.io.mem.aw.bits.addr
-  m_axi_gmem.AWID := shell.io.mem.aw.bits.id
-  m_axi_gmem.AWUSER := shell.io.mem.aw.bits.user
-  m_axi_gmem.AWLEN := shell.io.mem.aw.bits.len
-  m_axi_gmem.AWSIZE := shell.io.mem.aw.bits.size
-  m_axi_gmem.AWBURST := shell.io.mem.aw.bits.burst
-  m_axi_gmem.AWLOCK := shell.io.mem.aw.bits.lock
-  m_axi_gmem.AWCACHE := shell.io.mem.aw.bits.cache
-  m_axi_gmem.AWPROT := shell.io.mem.aw.bits.prot
-  m_axi_gmem.AWQOS := shell.io.mem.aw.bits.qos
-  m_axi_gmem.AWREGION := shell.io.mem.aw.bits.region
-
-  m_axi_gmem.WVALID := shell.io.mem.w.valid
-  shell.io.mem.w.ready := m_axi_gmem.WREADY
-  m_axi_gmem.WDATA := shell.io.mem.w.bits.data
-  m_axi_gmem.WSTRB := shell.io.mem.w.bits.strb
-  m_axi_gmem.WLAST := shell.io.mem.w.bits.last
-  m_axi_gmem.WID := shell.io.mem.w.bits.id
-  m_axi_gmem.WUSER := shell.io.mem.w.bits.user
-
-  shell.io.mem.b.valid := m_axi_gmem.BVALID
-  m_axi_gmem.BREADY := shell.io.mem.b.valid
-  shell.io.mem.b.bits.resp := m_axi_gmem.BRESP
-  shell.io.mem.b.bits.id := m_axi_gmem.BID
-  shell.io.mem.b.bits.user := m_axi_gmem.BUSER
-
-  m_axi_gmem.ARVALID := shell.io.mem.ar.valid
-  shell.io.mem.ar.ready := m_axi_gmem.ARREADY
-  m_axi_gmem.ARADDR := shell.io.mem.ar.bits.addr
-  m_axi_gmem.ARID := shell.io.mem.ar.bits.id
-  m_axi_gmem.ARUSER := shell.io.mem.ar.bits.user
-  m_axi_gmem.ARLEN := shell.io.mem.ar.bits.len
-  m_axi_gmem.ARSIZE := shell.io.mem.ar.bits.size
-  m_axi_gmem.ARBURST := shell.io.mem.ar.bits.burst
-  m_axi_gmem.ARLOCK := shell.io.mem.ar.bits.lock
-  m_axi_gmem.ARCACHE := shell.io.mem.ar.bits.cache
-  m_axi_gmem.ARPROT := shell.io.mem.ar.bits.prot
-  m_axi_gmem.ARQOS := shell.io.mem.ar.bits.qos
-  m_axi_gmem.ARREGION := shell.io.mem.ar.bits.region
-
-  shell.io.mem.r.valid := m_axi_gmem.RVALID
-  m_axi_gmem.RREADY := shell.io.mem.r.ready
-  shell.io.mem.r.bits.data := m_axi_gmem.RDATA
-  shell.io.mem.r.bits.resp := m_axi_gmem.RRESP
-  shell.io.mem.r.bits.last := m_axi_gmem.RLAST
-  shell.io.mem.r.bits.id := m_axi_gmem.RID
-  shell.io.mem.r.bits.user := m_axi_gmem.RUSER
-
-  // host
-  shell.io.host.aw.valid := s_axi_control.AWVALID
-  s_axi_control.AWREADY := shell.io.host.aw.ready
-  shell.io.host.aw.bits.addr := s_axi_control.AWADDR
-
-  shell.io.host.w.valid := s_axi_control.WVALID
-  s_axi_control.WREADY := shell.io.host.w.ready
-  shell.io.host.w.bits.data := s_axi_control.WDATA
-  shell.io.host.w.bits.strb := s_axi_control.WSTRB
-
-  s_axi_control.BVALID := shell.io.host.b.valid
-  shell.io.host.b.ready := s_axi_control.BREADY
-  s_axi_control.BRESP := shell.io.host.b.bits.resp
-
-  shell.io.host.ar.valid := s_axi_control.ARVALID
-  s_axi_control.ARREADY := shell.io.host.ar.ready
-  shell.io.host.ar.bits.addr := s_axi_control.ARADDR
-
-  s_axi_control.RVALID := shell.io.host.r.valid
-  shell.io.host.r.ready := s_axi_control.RREADY
-  s_axi_control.RDATA := shell.io.host.r.bits.data
-  s_axi_control.RRESP := shell.io.host.r.bits.resp
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/test/Test.scala b/vta/vta-hw/hardware/chisel/src/main/scala/test/Test.scala
deleted file mode 100644
index 7749d9536554..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/test/Test.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.test
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.util.config._
-import vta.shell._
-
-/** Test. This generates a testbench file for simulation */
-class Test(implicit p: Parameters) extends MultiIOModule {
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val sim_shell = Module(new SimShell)
-  val vta_shell = Module(new VTAShell)
-  sim_shell.sim_clock := sim_clock
-  sim_wait := sim_shell.sim_wait
-  sim_shell.mem <> vta_shell.io.mem
-  vta_shell.io.host <> sim_shell.host
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/util/Config.scala b/vta/vta-hw/hardware/chisel/src/main/scala/util/Config.scala
deleted file mode 100644
index d63d95665571..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/util/Config.scala
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.util.config
-
-// taken from https://github.com/vta.roject/rocket-chip
-
-abstract class Field[T] private (val default: Option[T]) {
-  def this() = this(None)
-  def this(default: T) = this(Some(default))
-}
-
-abstract class View {
-  final def apply[T](pname: Field[T]): T = apply(pname, this)
-  final def apply[T](pname: Field[T], site: View): T = {
-    val out = find(pname, site)
-    require(out.isDefined, s"Key ${pname} is not defined in Parameters")
-    out.get
-  }
-
-  final def lift[T](pname: Field[T]): Option[T] = lift(pname, this)
-  final def lift[T](pname: Field[T], site: View): Option[T] =
-    find(pname, site).map(_.asInstanceOf[T])
-
-  protected[config] def find[T](pname: Field[T], site: View): Option[T]
-}
-
-abstract class Parameters extends View {
-  final def ++(x: Parameters): Parameters =
-    new ChainParameters(this, x)
-
-  final def alter(
-    f: (View, View, View) => PartialFunction[Any, Any]): Parameters =
-    Parameters(f) ++ this
-
-  final def alterPartial(f: PartialFunction[Any, Any]): Parameters =
-    Parameters((_, _, _) => f) ++ this
-
-  final def alterMap(m: Map[Any, Any]): Parameters =
-    new MapParameters(m) ++ this
-
-  protected[config] def chain[T](site: View,
-    tail: View,
-    pname: Field[T]): Option[T]
-  protected[config] def find[T](pname: Field[T], site: View) =
-    chain(site, new TerminalView, pname)
-}
-
-object Parameters {
-  def empty: Parameters = new EmptyParameters
-  def apply(f: (View, View, View) => PartialFunction[Any, Any]): Parameters =
-    new PartialParameters(f)
-}
-
-class Config(p: Parameters) extends Parameters {
-  def this(f: (View, View, View) => PartialFunction[Any, Any]) =
-    this(Parameters(f))
-
-  protected[config] def chain[T](site: View, tail: View, pname: Field[T]) =
-    p.chain(site, tail, pname)
-  override def toString = this.getClass.getSimpleName
-  def toInstance = this
-}
-
-// Internal implementation:
-
-private class TerminalView extends View {
-  def find[T](pname: Field[T], site: View): Option[T] = pname.default
-}
-
-private class ChainView(head: Parameters, tail: View) extends View {
-  def find[T](pname: Field[T], site: View) = head.chain(site, tail, pname)
-}
-
-private class ChainParameters(x: Parameters, y: Parameters) extends Parameters {
-  def chain[T](site: View, tail: View, pname: Field[T]) =
-    x.chain(site, new ChainView(y, tail), pname)
-}
-
-private class EmptyParameters extends Parameters {
-  def chain[T](site: View, tail: View, pname: Field[T]) = tail.find(pname, site)
-}
-
-private class PartialParameters(
-    f: (View, View, View) => PartialFunction[Any, Any])
-    extends Parameters {
-  protected[config] def chain[T](site: View, tail: View, pname: Field[T]) = {
-    val g = f(site, this, tail)
-    if (g.isDefinedAt(pname)) Some(g.apply(pname).asInstanceOf[T])
-    else tail.find(pname, site)
-  }
-}
-
-private class MapParameters(map: Map[Any, Any]) extends Parameters {
-  protected[config] def chain[T](site: View, tail: View, pname: Field[T]) = {
-    val g = map.get(pname)
-    if (g.isDefined) Some(g.get.asInstanceOf[T]) else tail.find(pname, site)
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala b/vta/vta-hw/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
deleted file mode 100644
index 063e76673396..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.util.genericbundle
-
-// taken from https://github.com/vta.roject/rocket-chip
-
-import chisel3._
-
-abstract class GenericParameterizedBundle[+T <: Object]
-  (val params: T) extends Bundle {
-  override def cloneType = {
-    try {
-      this.getClass.getConstructors.head
-        .newInstance(params)
-        .asInstanceOf[this.type]
-    } catch {
-      case e: java.lang.IllegalArgumentException =>
-        throw new Exception(
-          "Unable to use GenericParameterizedBundle.cloneType on " +
-            this.getClass + ", probably because " + this.getClass +
-            "() takes more than one argument.  Consider overriding " +
-            "cloneType() on " + this.getClass,
-          e
-        )
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/vta/Configs.scala b/vta/vta-hw/hardware/chisel/src/main/scala/vta/Configs.scala
deleted file mode 100644
index 350379b2ec22..000000000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/vta/Configs.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta
-
-import chisel3._
-import vta.util.config._
-import vta.shell._
-import vta.core._
-import vta.test._
-
-/** VTA.
- *
- * This file contains all the configurations supported by VTA.
- * These configurations are built in a mix/match form based on core
- * and shell configurations.
- */
-class DefaultPynqConfig extends Config(new CoreConfig ++ new PynqConfig)
-class DefaultF1Config extends Config(new CoreConfig ++ new F1Config)
-class DefaultDe10Config extends Config(new CoreConfig ++ new De10Config)
-
-object DefaultPynqConfig extends App {
-  implicit val p: Parameters = new DefaultPynqConfig
-  chisel3.Driver.execute(args, () => new XilinxShell)
-}
-
-object DefaultF1Config extends App {
-  implicit val p: Parameters = new DefaultF1Config
-  chisel3.Driver.execute(args, () => new XilinxShell)
-}
-
-object DefaultDe10Config extends App {
-  implicit val p: Parameters = new DefaultDe10Config
-  chisel3.Driver.execute(args, () => new IntelShell)
-}
-
-object TestDefaultPynqConfig extends App {
-  implicit val p: Parameters = new DefaultPynqConfig
-  chisel3.Driver.execute(args, () => new Test)
-}
-
-object TestDefaultF1Config extends App {
-  implicit val p: Parameters = new DefaultF1Config
-  chisel3.Driver.execute(args, () => new Test)
-}
-
-object TestDefaultDe10Config extends App {
-  implicit val p: Parameters = new DefaultDe10Config
-  chisel3.Driver.execute(args, () => new Test)
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/AluTest.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/AluTest.scala
deleted file mode 100644
index 56d81b874dd7..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/AluTest.scala
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest
-
-import chisel3._
-import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
-import scala.util.Random
-import unittest.util._
-import vta.core._
-
-class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
-
-  /* alu_ref
-   *
-   * This is a software function used as a reference for the hardware
-   */
-  def aluRef(opcode: Int, a: Array[Int], b: Array[Int], width: Int) : Array[Int] = {
-    val size = a.length
-    val mask = helper.getMask(log2Ceil(width))
-    val res = Array.fill(size) {0}
-
-    if (opcode == 1) {
-      for (i <- 0 until size) {
-        res(i) = if (a(i) < b(i)) b(i) else a(i)
-      }
-    } else if (opcode == 2) {
-      for (i <- 0 until size) {
-        res(i) = a(i) + b(i)
-      }
-    } else if (opcode == 3) {
-      for (i <- 0 until size) {
-        res(i) = a(i) >> (b(i) & mask).toInt
-      }
-    } else if (opcode == 4) {
-      // HLS shift left by >> negative number
-      // b always < 0 when opcode == 4
-      for (i <- 0 until size) {
-        res(i) = a(i) << ((-1*b(i)) & mask)
-      }
-    } else {
-      // default
-      for (i <- 0 until size) {
-        res(i) = if (a(i) < b(i)) a(i) else b(i)
-      }
-    }
-    return res
-  }
-
-  val num_ops = ALU_OP_NUM
-  for (i <- 0 until num_ops) {
-    // generate data based on bits
-    val bits = c.aluBits
-    val dataGen = new RandomArray(c.blockOut, bits)
-    val op = i
-    val in_a = dataGen.any
-    val in_b = if (op != 4) dataGen.any else dataGen.negative
-    val mask = helper.getMask(bits)
-    val res = aluRef(op, in_a, in_b, bits)
-
-    for (i <- 0 until c.blockOut) {
-      poke(c.io.acc_a.data.bits(0)(i), in_a(i) & mask)
-      poke(c.io.acc_b.data.bits(0)(i), in_b(i) & mask)
-    }
-    poke(c.io.opcode, op)
-
-    poke(c.io.acc_a.data.valid, 1)
-    poke(c.io.acc_b.data.valid, 1)
-    poke(c.io.acc_y.data.valid, 1)
-
-    step(1)
-
-    poke(c.io.acc_a.data.valid, 0)
-    poke(c.io.acc_b.data.valid, 0)
-    poke(c.io.acc_y.data.valid, 0)
-
-    // wait for valid signal
-    while (peek(c.io.acc_y.data.valid) == BigInt(0)) {
-      step(1) // advance clock
-    }
-    if (peek(c.io.acc_y.data.valid) == BigInt(1)) {
-      for (i <- 0 until c.blockOut) {
-          expect(c.io.acc_y.data.bits(0)(i), res(i) & mask)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/Launcher.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/Launcher.scala
deleted file mode 100644
index 2a1d201088ad..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/Launcher.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest
-// taken from https://github.com/freechipsproject/chisel-testers
-
-import chisel3._
-import chisel3.iotesters.{Driver, TesterOptionsManager}
-import unittest.util._
-import vta.core._
-import vta.util.config._
-import vta.shell._
-
-class TestConfig extends Config(new CoreConfig ++ new PynqConfig)
-
-/* Launcher.
- *
- * The Launcher object includes a test list for the TestRunner to check.
- * Users can utilize this Launcher to run custom tests.
- *
- * How to Use:
- * When the user input: sbt 'test:runMain unittest.Launcher mvm'
- * the TestRunner will look for 'mvm' in the map and executes the
- * test that 'mvm' is mapped to
- */
-object Launcher {
-  implicit val p: Parameters = new TestConfig
-  val tests = Map(
-    "mvm" -> { (manager: TesterOptionsManager) =>
-      Driver.execute(() => new MatrixVectorMultiplication, manager) {
-        (c) => new TestMatrixVectorMultiplication(c)
-      }
-    },
-		"alu" -> { (manager: TesterOptionsManager) =>
-      Driver.execute(() => new AluVector, manager) {
-        (c) => new TestAluVector(c)
-      }
-    }
-  )
-
-  def main(args: Array[String]): Unit = {
-    TestRunner(tests, args)
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/MvmTest.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/MvmTest.scala
deleted file mode 100644
index b8af87958067..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/MvmTest.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest
-
-import chisel3._
-import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
-import scala.math.pow
-import unittest.util._
-import vta.core._
-
-class TestMatrixVectorMultiplication(c: MatrixVectorMultiplication) extends PeekPokeTester(c) {
-
-  /* mvm_ref
-   *
-   * This is a software function that computes dot product with a programmable shift
-   * This is used as a reference for the hardware
-   */
-  def mvmRef(inp: Array[Int], wgt: Array[Array[Int]], shift: Int) : Array[Int] = {
-    val size = inp.length
-    val res = Array.fill(size) {0}
-    for (i <- 0 until size) {
-        var dot = 0
-        for (j <- 0 until size) {
-          dot += wgt(i)(j) * inp(j)
-        }
-        res(i) = dot * pow(2, shift).toInt
-    }
-    return res
-  }
-
-  val cycles = 5
-  for (i <- 0 until cycles) {
-    // generate data based on bits
-    val inpGen = new RandomArray(c.size, c.inpBits)
-    val wgtGen = new RandomArray(c.size, c.wgtBits)
-    val in_a = inpGen.any
-    val in_b = Array.fill(c.size) { wgtGen.any }
-    val res = mvmRef(in_a, in_b, 0)
-    val inpMask = helper.getMask(c.inpBits)
-    val wgtMask = helper.getMask(c.wgtBits)
-    val accMask = helper.getMask(c.accBits)
-
-    for (i <- 0 until c.size) {
-      poke(c.io.inp.data.bits(0)(i), in_a(i) & inpMask)
-      poke(c.io.acc_i.data.bits(0)(i), 0)
-      for (j <- 0 until c.size) {
-        poke(c.io.wgt.data.bits(i)(j), in_b(i)(j) & wgtMask)
-      }
-    }
-
-    poke(c.io.reset, 0)
-
-    poke(c.io.inp.data.valid, 1)
-    poke(c.io.wgt.data.valid, 1)
-    poke(c.io.acc_i.data.valid, 1)
-
-    step(1)
-
-    poke(c.io.inp.data.valid, 0)
-    poke(c.io.wgt.data.valid, 0)
-    poke(c.io.acc_i.data.valid, 0)
-
-    // wait for valid signal
-    while (peek(c.io.acc_o.data.valid) == BigInt(0)) {
-      step(1) // advance clock
-    }
-    if (peek(c.io.acc_o.data.valid) == BigInt(1)) {
-      for (i <- 0 until c.size) {
-          expect(c.io.acc_o.data.bits(0)(i), res(i) & accMask)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/Helper.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/Helper.scala
deleted file mode 100644
index c6b006a9f950..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/Helper.scala
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest.util
-
-import scala.math.pow
-
-object helper {
-  def getMask(bits: Int) : Long = {
-    if (bits <= 0) throw new IllegalArgumentException ("bits should be greater than 0")
-    return (pow(2, bits) - 1).toLong
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/RandomArray.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/RandomArray.scala
deleted file mode 100644
index 727ad825905d..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/RandomArray.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest.util
-
-import scala.util.Random
-import scala.math.pow
-
-class RandomArray(val len: Int, val bits: Int) {
-  val r = new Random
-  if (bits < 1) throw new IllegalArgumentException ("bits should be greater than 1")
-
-  def any : Array[Int] = {
-    return Array.fill(len) { r.nextInt(pow(2, bits).toInt) - pow(2, bits-1).toInt }
-  }
-
-  def positive : Array[Int] = {
-    return Array.fill(len) { r.nextInt(pow(2, bits-1).toInt) }
-  }
-
-  def negative : Array[Int] = {
-    return Array.fill(len) { 0 - r.nextInt(pow(2, bits-1).toInt) }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala b/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
deleted file mode 100644
index 789eeb957836..000000000000
--- a/vta/vta-hw/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest.util
-// taken from https://github.com/freechipsproject/chisel-testers
-
-import scala.collection.mutable.ArrayBuffer
-import chisel3.iotesters._
-
-object TestRunner {
-
-  def apply(testMap: Map[String, TesterOptionsManager => Boolean], args: Array[String]): Unit = {
-    var successful = 0
-    val errors = new ArrayBuffer[String]
-
-    val optionsManager = new TesterOptionsManager()
-    optionsManager.doNotExitOnHelp()
-
-    optionsManager.parse(args)
-
-    val programArgs = optionsManager.commonOptions.programArgs
-
-    if(programArgs.isEmpty) {
-      println("Available tests")
-      for(x <- testMap.keys) {
-        println(x)
-      }
-      println("all")
-      System.exit(0)
-    }
-
-    val testsToRun = if(programArgs.exists(x => x.toLowerCase() == "all")) {
-      testMap.keys
-    }
-    else {
-      programArgs
-    }
-
-    for(testName <- testsToRun) {
-      testMap.get(testName) match {
-        case Some(test) =>
-          println(s"Starting $testName")
-          try {
-            optionsManager.setTopName(testName)
-            optionsManager.setTargetDirName(s"test_run_dir/$testName")
-            if(test(optionsManager)) {
-              successful += 1
-            }
-            else {
-              errors += s"$testName: test error occurred"
-            }
-          }
-          catch {
-            case exception: Exception =>
-              exception.printStackTrace()
-              errors += s"$testName: exception ${exception.getMessage}"
-            case t : Throwable =>
-              errors += s"$testName: throwable ${t.getMessage}"
-          }
-        case _ =>
-          errors += s"Bad Test name: $testName"
-      }
-
-    }
-    if(successful > 0) {
-      println(s"Tests passing: $successful")
-    }
-    if(errors.nonEmpty) {
-      println("=" * 80)
-      println(s"Errors: ${errors.length}: in the following tests")
-      println(errors.mkString("\n"))
-      println("=" * 80)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/dpi/tsim_device.cc b/vta/vta-hw/hardware/dpi/tsim_device.cc
deleted file mode 100644
index ffa192b283ea..000000000000
--- a/vta/vta-hw/hardware/dpi/tsim_device.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <chrono>
-#include <thread>
-#include <vta/dpi/tsim.h>
-
-#if VM_TRACE
-#ifdef VM_TRACE_FST
-#include <verilated_fst_c.h>
-#else
-#include <verilated_vcd_c.h>
-#endif
-#endif
-
-#if VM_TRACE
-#define STRINGIZE(x) #x
-#define STRINGIZE_VALUE_OF(x) STRINGIZE(x)
-#endif
-
-static VTAContextHandle _ctx = nullptr;
-static VTASimDPIFunc _sim_dpi = nullptr;
-static VTAHostDPIFunc _host_dpi = nullptr;
-static VTAMemDPIFunc _mem_dpi = nullptr;
-
-void VTASimDPI(dpi8_t* wait,
-               dpi8_t* exit) {
-  assert(_sim_dpi != nullptr);
-  (*_sim_dpi)(_ctx, wait, exit);
-}
-
-void VTAHostDPI(dpi8_t* req_valid,
-                dpi8_t* req_opcode,
-                dpi8_t* req_addr,
-                dpi32_t* req_value,
-                dpi8_t req_deq,
-                dpi8_t resp_valid,
-                dpi32_t resp_value) {
-  assert(_host_dpi != nullptr);
-  (*_host_dpi)(_ctx, req_valid, req_opcode,
-               req_addr, req_value, req_deq,
-               resp_valid, resp_value);
-}
-
-void VTAMemDPI(dpi8_t req_valid,
-               dpi8_t req_opcode,
-               dpi8_t req_len,
-               dpi64_t req_addr,
-               dpi8_t wr_valid,
-               dpi64_t wr_value,
-               dpi8_t* rd_valid,
-               dpi64_t* rd_value,
-               dpi8_t rd_ready) {
-  assert(_mem_dpi != nullptr);
-  (*_mem_dpi)(_ctx, req_valid, req_opcode, req_len,
-              req_addr, wr_valid, wr_value,
-              rd_valid, rd_value, rd_ready);
-
-}
-
-void VTADPIInit(VTAContextHandle handle,
-                VTASimDPIFunc sim_dpi,
-                VTAHostDPIFunc host_dpi,
-                VTAMemDPIFunc mem_dpi) {
-  _ctx = handle;
-  _sim_dpi = sim_dpi;
-  _host_dpi = host_dpi;
-  _mem_dpi = mem_dpi;
-}
-
-
-// Override Verilator finish definition
-// VL_USER_FINISH needs to be defined when compiling Verilator code
-void vl_finish(const char* filename, int linenum, const char* hier) {
-  Verilated::gotFinish(true);
-}
-
-int VTADPISim() {
-  uint64_t trace_count = 0;
-  Verilated::flushCall();
-  Verilated::gotFinish(false);
-
-#if VM_TRACE
-  uint64_t start = 0;
-#endif
-
-  VL_TSIM_NAME* top = new VL_TSIM_NAME;
-
-#if VM_TRACE
-  Verilated::traceEverOn(true);
-#ifdef VM_TRACE_FST
-  VerilatedFstC* tfp = new VerilatedFstC;
-#else
-  VerilatedVcdC* tfp = new VerilatedVcdC;
-#endif // VM_TRACE_FST
-  top->trace(tfp, 99);
-  tfp->open(STRINGIZE_VALUE_OF(TSIM_TRACE_FILE));
-#endif
-
-  // reset
-  for (int i = 0; i < 10; i++) {
-    top->reset = 1;
-    top->clock = 0;
-    top->eval();
-#if VM_TRACE
-    if (trace_count >= start)
-      tfp->dump(static_cast<vluint64_t>(trace_count * 2));
-#endif
-    top->clock = 1;
-    top->eval();
-#if VM_TRACE
-    if (trace_count >= start)
-      tfp->dump(static_cast<vluint64_t>(trace_count * 2 + 1));
-#endif
-    trace_count++;
-  }
-  top->reset = 0;
-
-  // start simulation
-  while (!Verilated::gotFinish()) {
-    top->sim_clock = 0;
-    top->clock = 0;
-    top->eval();
-#if VM_TRACE
-    if (trace_count >= start)
-      tfp->dump(static_cast<vluint64_t>(trace_count * 2));
-#endif
-    top->sim_clock = 1;
-    top->clock = 1;
-    top->eval();
-#if VM_TRACE
-    if (trace_count >= start)
-      tfp->dump(static_cast<vluint64_t>(trace_count * 2 + 1));
-#endif
-    trace_count++;
-    if ((trace_count % 1000000) == 1)
-      fprintf(stderr, "[traced %luM cycles]\n", trace_count / 1000000);
-    while (top->sim_wait) {
-      top->clock = 0;
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      top->sim_clock = 0;
-      top->eval();
-      top->sim_clock = 1;
-      top->eval();
-    }
-  }
-
-#if VM_TRACE
-  tfp->close();
-#endif
-
-  delete top;
-
-  return 0;
-}
diff --git a/vta/vta-hw/hardware/intel/Makefile b/vta/vta-hw/hardware/intel/Makefile
deleted file mode 100644
index b3638dc4c0ab..000000000000
--- a/vta/vta-hw/hardware/intel/Makefile
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Directories
-ROOTDIR = $(CURDIR)
-BUILD_NAME = build
-BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/intel
-SCRIPT_DIR = $(ROOTDIR)/scripts
-SRC_DIR = $(ROOTDIR)/../chisel
-
-# Process VTA JSON config
-VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
-
-# Debug flag
-DEBUG = false
-# Prevent generation of DSP
-NO_DSP = true
-# Device
-DEVICE = $(shell $(VTA_CONFIG) --get-fpga-dev)
-# Device family
-DEVICE_FAMILY = $(shell $(VTA_CONFIG) --get-fpga-family)
-# Project name
-PROJECT = de10_nano_top
-# Frequency in MHz
-FREQ_MHZ = $(shell $(VTA_CONFIG) --get-fpga-freq)
-
-#---------------------
-# Compilation parameters
-#--------------------
-
-# Derive config name
-CONF = $(shell ${VTA_CONFIG} --cfg-str)
-IP_BUILD_PATH = $(BUILD_DIR)/chisel/$(CONF)
-HW_BUILD_PATH = $(BUILD_DIR)/quartus/$(CONF)
-
-ifeq ($(NO_DSP), true)
-  DSP_FLAG =
-else
-  DSP_FLAG = --dsp
-endif
-
-# IP file path
-IP_PATH = $(IP_BUILD_PATH)/VTA.DefaultDe10Config.v
-
-# Bitstream file path
-BIT_PATH = $(HW_BUILD_PATH)/export/vta_$(FREQ_MHZ)MHz.rbf
-CPF_OPT := -o bitstream_compression=on
-
-# System design file path
-QSYS_PATH = $(HW_BUILD_PATH)/soc_system.qsys
-
-.PHONY: all ip bit qsys clean
-
-all: bit
-ip: $(IP_PATH)
-bit: $(BIT_PATH)
-qsys: $(QSYS_PATH)
-
-$(IP_PATH): $(SRC_DIR)
-	mkdir -p $(IP_BUILD_PATH)
-	cd $(SRC_DIR) && \
-    make CONFIG=DefaultDe10Config chisel_build_dir=$(IP_BUILD_PATH) verilog
-
-$(QSYS_PATH): $(IP_PATH)
-	mkdir -p $(HW_BUILD_PATH)
-	cd $(HW_BUILD_PATH) && \
-    cp -r $(SCRIPT_DIR)/* $(HW_BUILD_PATH) && \
-    python3 $(SCRIPT_DIR)/set_attrs.py -i $(IP_PATH) -o $(HW_BUILD_PATH)/ip/vta/VTAShell.v $(DSP_FLAG) && \
-    qsys-script --script=soc_system.tcl $(DEVICE) $(DEVICE_FAMILY) $(FREQ_MHZ)
-
-$(BIT_PATH): $(QSYS_PATH)
-	cd $(HW_BUILD_PATH) && \
-    quartus_sh -t $(SCRIPT_DIR)/compile_design.tcl $(DEVICE) $(PROJECT) && \
-    mkdir -p $(shell dirname $(BIT_PATH)) && \
-    quartus_cpf $(CPF_OPT) -c $(HW_BUILD_PATH)/$(PROJECT).sof $(BIT_PATH)
-
-clean:
-	rm -rf $(BUILD_DIR)
-
-clean-qsys:
-	rm -rf $(QSYS_PATH)
diff --git a/vta/vta-hw/hardware/intel/README.md b/vta/vta-hw/hardware/intel/README.md
deleted file mode 100644
index 6d289cff0a6e..000000000000
--- a/vta/vta-hw/hardware/intel/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Complete instructions on how to build custom FPGA hardware designs are available on the [TVM documentation webpage](https://docs.tvm.ai/vta/install.html#vta-fpga-toolchain-installation).
diff --git a/vta/vta-hw/hardware/intel/scripts/compile_design.tcl b/vta/vta-hw/hardware/intel/scripts/compile_design.tcl
deleted file mode 100644
index 2297e6340147..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/compile_design.tcl
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Load Quartus Prime Tcl Project package
-package require ::quartus::project
-
-set DEVICE [lindex $argv 0]
-set PROJECT_NAME [lindex $argv 1]
-
-set need_to_close_project 0
-set make_assignments 1
-
-# Check that the right project is open
-if {[is_project_open]} {
-  if {[string compare $quartus(project) "${PROJECT_NAME}"]} {
-    puts "Project ${PROJECT_NAME} is not open"
-    set make_assignments 0
-  }
-} else {
-  # Only open if not already open
-  if {[project_exists ${PROJECT_NAME}]} {
-    project_open -revision ${PROJECT_NAME} ${PROJECT_NAME}
-  } else {
-    project_new -revision ${PROJECT_NAME} ${PROJECT_NAME}
-  }
-  set need_to_close_project 1
-}
-
-# Make assignments
-if {$make_assignments} {
-  set_global_assignment -name FAMILY "Cyclone V"
-  set_global_assignment -name DEVICE $DEVICE
-  set_global_assignment -name ORIGINAL_QUARTUS_VERSION 18.1.0
-  set_global_assignment -name PROJECT_CREATION_TIME_DATE "14:21:53  JUNE 17, 2019"
-  set_global_assignment -name LAST_QUARTUS_VERSION "18.1.0 Lite Edition"
-  set_global_assignment -name MIN_CORE_JUNCTION_TEMP "-40"
-  set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
-  set_global_assignment -name POWER_PRESET_COOLING_SOLUTION "23 MM HEAT SINK WITH 200 LFPM AIRFLOW"
-  set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)"
-  set_global_assignment -name PARTITION_NETLIST_TYPE SOURCE -section_id Top
-  set_global_assignment -name PARTITION_FITTER_PRESERVATION_LEVEL PLACEMENT_AND_ROUTING -section_id Top
-  set_global_assignment -name PARTITION_COLOR 16764057 -section_id Top
-  set_global_assignment -name DEVICE_MIGRATION_LIST $DEVICE
-  set_global_assignment -name USE_DLL_FREQUENCY_FOR_DQS_DELAY_CHAIN ON
-  set_global_assignment -name UNIPHY_SEQUENCER_DQS_CONFIG_ENABLE ON
-  set_global_assignment -name ECO_REGENERATE_REPORT ON
-  set_global_assignment -name ENABLE_SIGNALTAP OFF
-  set_global_assignment -name ALLOW_REGISTER_RETIMING ON
-  set_global_assignment -name OPTIMIZATION_MODE BALANCED
-  set_global_assignment -name VERILOG_FILE ip/vta/VTAShell.v
-  set_global_assignment -name QSYS_FILE soc_system.qsys
-  set_global_assignment -name SDC_FILE set_clocks.sdc
-  set_global_assignment -name VERILOG_FILE ${PROJECT_NAME}.v
-  set_global_assignment -name SIGNALTAP_FILE ${PROJECT_NAME}.stp
-  set_global_assignment -name USE_SIGNALTAP_FILE ${PROJECT_NAME}.stp
-
-  set_location_assignment PIN_V11 -to FPGA_CLK1_50
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to FPGA_CLK1_50
-  set_location_assignment PIN_Y13 -to FPGA_CLK2_50
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to FPGA_CLK2_50
-  set_location_assignment PIN_E11 -to FPGA_CLK3_50
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to FPGA_CLK3_50
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to HPS_CONV_USB_N
-  set_location_assignment PIN_W15 -to LED[0]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[0]
-  set_location_assignment PIN_AA24 -to LED[1]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[1]
-  set_location_assignment PIN_V16 -to LED[2]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[2]
-  set_location_assignment PIN_V15 -to LED[3]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[3]
-  set_location_assignment PIN_AF26 -to LED[4]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[4]
-  set_location_assignment PIN_AE26 -to LED[5]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[5]
-  set_location_assignment PIN_Y16 -to LED[6]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[6]
-  set_location_assignment PIN_AA23 -to LED[7]
-  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[7]
-
-  for {set i 0} {$i < 32} {incr i} {
-    set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_DQ[$i]
-    set_instance_assignment -name INPUT_TERMINATION "PARALLEL 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQ[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQ[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_DQ[$i] -tag __hps_sdram_p0
-  }
-
-  for {set i 0} {$i < 15} {incr i} {
-    set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_ADDR[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_ADDR[$i]
-    set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_ADDR[$i]
-  }
-
-  for {set i 0} {$i < 4} {incr i} {
-    set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_DM[$i]
-    set_instance_assignment -name IO_STANDARD "DIFFERENTIAL 1.5-V SSTL CLASS I" -to HPS_DDR3_DQS_N[$i]
-    set_instance_assignment -name IO_STANDARD "DIFFERENTIAL 1.5-V SSTL CLASS I" -to HPS_DDR3_DQS_P[$i]
-    set_instance_assignment -name INPUT_TERMINATION "PARALLEL 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQS_P[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQS_P[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name INPUT_TERMINATION "PARALLEL 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQS_N[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQS_N[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITH CALIBRATION" -to HPS_DDR3_DM[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_DM[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_DQS_P[$i] -tag __hps_sdram_p0
-    set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_DQS_N[$i] -tag __hps_sdram_p0
-  }
-
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_BA[0]
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_BA[0]
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_BA[1]
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_BA[1]
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_BA[2]
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_BA[2]
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_CAS_N
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_CAS_N
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_CKE
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_CKE
-  set_instance_assignment -name IO_STANDARD "DIFFERENTIAL 1.5-V SSTL CLASS I" -to HPS_DDR3_CK_N
-  set_instance_assignment -name IO_STANDARD "DIFFERENTIAL 1.5-V SSTL CLASS I" -to HPS_DDR3_CK_P
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_CS_N
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_CS_N
-
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_ODT
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_ODT
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_RAS_N
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_RAS_N
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_RESET_N
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_RESET_N
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_RZQ
-  set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_WE_N
-  set_instance_assignment -name CURRENT_STRENGTH_NEW "MAXIMUM CURRENT" -to HPS_DDR3_WE_N
-
-  set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITHOUT CALIBRATION" -to HPS_DDR3_CK_P -tag __hps_sdram_p0
-  set_instance_assignment -name D5_DELAY 2 -to HPS_DDR3_CK_P -tag __hps_sdram_p0
-  set_instance_assignment -name OUTPUT_TERMINATION "SERIES 50 OHM WITHOUT CALIBRATION" -to HPS_DDR3_CK_N -tag __hps_sdram_p0
-  set_instance_assignment -name D5_DELAY 2 -to HPS_DDR3_CK_N -tag __hps_sdram_p0
-
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_BA[0] -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_BA[1] -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_BA[2] -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CAS_N -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CKE -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CS_N -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_ODT -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_RAS_N -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_WE_N -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_RESET_N -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CK_P -tag __hps_sdram_p0
-  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CK_N -tag __hps_sdram_p0
-
-  set_instance_assignment -name PARTITION_HIERARCHY root_partition -to | -section_id Top
-
-  # Commit assignments
-  export_assignments
-
-  load_package flow
-  execute_flow -compile
-
-  # Close project
-  if {$need_to_close_project} {
-    project_close
-  }
-}
diff --git a/vta/vta-hw/hardware/intel/scripts/de10_nano_top.v b/vta/vta-hw/hardware/intel/scripts/de10_nano_top.v
deleted file mode 100644
index 80a8fd686cb6..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/de10_nano_top.v
+++ /dev/null
@@ -1,110 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//=======================================================
-//  This code is partially generated by Terasic System Builder
-//=======================================================
-
-module de10_nano_top(
-
-    //////////// CLOCK //////////
-    input               FPGA_CLK1_50,
-    input               FPGA_CLK2_50,
-    input               FPGA_CLK3_50,
-
-    //////////// HPS //////////
-    inout               HPS_CONV_USB_N,
-    output   [14: 0]    HPS_DDR3_ADDR,
-    output   [ 2: 0]    HPS_DDR3_BA,
-    output              HPS_DDR3_CAS_N,
-    output              HPS_DDR3_CK_N,
-    output              HPS_DDR3_CK_P,
-    output              HPS_DDR3_CKE,
-    output              HPS_DDR3_CS_N,
-    output   [ 3: 0]    HPS_DDR3_DM,
-    inout    [31: 0]    HPS_DDR3_DQ,
-    inout    [ 3: 0]    HPS_DDR3_DQS_N,
-    inout    [ 3: 0]    HPS_DDR3_DQS_P,
-    output              HPS_DDR3_ODT,
-    output              HPS_DDR3_RAS_N,
-    output              HPS_DDR3_RESET_N,
-    input               HPS_DDR3_RZQ,
-    output              HPS_DDR3_WE_N,
-
-    //////////// LED //////////
-    output   [ 7: 0]    LED
-);
-
-
-
-//=======================================================
-//  REG/WIRE declarations
-//=======================================================
-wire hps_fpga_reset_n;
-wire                fpga_clk_50;
-// connection of internal logics
-assign fpga_clk_50 = FPGA_CLK1_50;
-
-
-//=======================================================
-//  Structural coding
-//=======================================================
-soc_system u0(
-               //Clock&Reset
-               .clk_clk(FPGA_CLK1_50),                                      //                            clk.clk
-               .reset_reset_n(hps_fpga_reset_n),                            //                          reset.reset_n
-               //HPS ddr3
-               .memory_mem_a(HPS_DDR3_ADDR),                                //                         memory.mem_a
-               .memory_mem_ba(HPS_DDR3_BA),                                 //                               .mem_ba
-               .memory_mem_ck(HPS_DDR3_CK_P),                               //                               .mem_ck
-               .memory_mem_ck_n(HPS_DDR3_CK_N),                             //                               .mem_ck_n
-               .memory_mem_cke(HPS_DDR3_CKE),                               //                               .mem_cke
-               .memory_mem_cs_n(HPS_DDR3_CS_N),                             //                               .mem_cs_n
-               .memory_mem_ras_n(HPS_DDR3_RAS_N),                           //                               .mem_ras_n
-               .memory_mem_cas_n(HPS_DDR3_CAS_N),                           //                               .mem_cas_n
-               .memory_mem_we_n(HPS_DDR3_WE_N),                             //                               .mem_we_n
-               .memory_mem_reset_n(HPS_DDR3_RESET_N),                       //                               .mem_reset_n
-               .memory_mem_dq(HPS_DDR3_DQ),                                 //                               .mem_dq
-               .memory_mem_dqs(HPS_DDR3_DQS_P),                             //                               .mem_dqs
-               .memory_mem_dqs_n(HPS_DDR3_DQS_N),                           //                               .mem_dqs_n
-               .memory_mem_odt(HPS_DDR3_ODT),                               //                               .mem_odt
-               .memory_mem_dm(HPS_DDR3_DM),                                 //                               .mem_dm
-               .memory_oct_rzqin(HPS_DDR3_RZQ),                             //                               .oct_rzqin
-               //FPGA
-               .hps_0_h2f_reset_reset_n(hps_fpga_reset_n)                   //                hps_0_h2f_reset.reset_n
-           );
-
-
-// Blink LED, to indicate everything is working
-reg [25: 0] counter;
-reg led_level;
-always @(posedge fpga_clk_50 or negedge hps_fpga_reset_n) begin
-    if (~hps_fpga_reset_n) begin
-        counter <= 0;
-        led_level <= 0;
-    end
-    else if (counter == 24999999) begin
-        counter <= 0;
-        led_level <= ~led_level;
-    end
-    else
-        counter <= counter + 1'b1;
-end
-assign LED[0] = led_level;
-
-
-endmodule
diff --git a/vta/vta-hw/hardware/intel/scripts/ip/vta/vta_hw.tcl b/vta/vta-hw/hardware/intel/scripts/ip/vta/vta_hw.tcl
deleted file mode 100644
index 59d6de7c6951..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/ip/vta/vta_hw.tcl
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# request TCL package from ACDS 16.1
-package require -exact qsys 16.1
-
-# module vta
-set_module_property DESCRIPTION ""
-set_module_property NAME vta
-set_module_property VERSION 1.0
-set_module_property INTERNAL false
-set_module_property OPAQUE_ADDRESS_MAP true
-set_module_property AUTHOR ""
-set_module_property DISPLAY_NAME "VTA Subsystem"
-set_module_property INSTANTIATE_IN_SYSTEM_MODULE true
-set_module_property EDITABLE true
-set_module_property REPORT_TO_TALKBACK false
-set_module_property ALLOW_GREYBOX_GENERATION false
-set_module_property REPORT_HIERARCHY false
-
-# file sets
-add_fileset QUARTUS_SYNTH QUARTUS_SYNTH "" ""
-set_fileset_property QUARTUS_SYNTH TOP_LEVEL IntelShell
-set_fileset_property QUARTUS_SYNTH ENABLE_RELATIVE_INCLUDE_PATHS false
-set_fileset_property QUARTUS_SYNTH ENABLE_FILE_OVERWRITE_MODE false
-add_fileset_file VTAShell.v VERILOG PATH VTAShell.v TOP_LEVEL_FILE
-
-# connection point clock
-add_interface clock clock end
-set_interface_property clock clockRate 0
-set_interface_property clock ENABLED true
-set_interface_property clock EXPORT_OF ""
-set_interface_property clock PORT_NAME_MAP ""
-set_interface_property clock CMSIS_SVD_VARIABLES ""
-set_interface_property clock SVD_ADDRESS_GROUP ""
-
-add_interface_port clock clock clk Input 1
-
-# connection point reset
-add_interface reset reset end
-set_interface_property reset associatedClock clock
-set_interface_property reset synchronousEdges DEASSERT
-set_interface_property reset ENABLED true
-set_interface_property reset EXPORT_OF ""
-set_interface_property reset PORT_NAME_MAP ""
-set_interface_property reset CMSIS_SVD_VARIABLES ""
-set_interface_property reset SVD_ADDRESS_GROUP ""
-
-add_interface_port reset reset reset Input 1
-
-# connection point m_axi_gmem
-add_interface m_axi_gmem axi start
-set_interface_property m_axi_gmem associatedClock clock
-set_interface_property m_axi_gmem associatedReset reset
-set_interface_property m_axi_gmem readIssuingCapability 1
-set_interface_property m_axi_gmem writeIssuingCapability 1
-set_interface_property m_axi_gmem combinedIssuingCapability 1
-set_interface_property m_axi_gmem ENABLED true
-set_interface_property m_axi_gmem EXPORT_OF ""
-set_interface_property m_axi_gmem PORT_NAME_MAP ""
-set_interface_property m_axi_gmem CMSIS_SVD_VARIABLES ""
-set_interface_property m_axi_gmem SVD_ADDRESS_GROUP ""
-
-add_interface_port m_axi_gmem io_mem_ar_ready arready Input 1
-add_interface_port m_axi_gmem io_mem_ar_valid arvalid Output 1
-add_interface_port m_axi_gmem io_mem_ar_bits_addr araddr Output 32
-add_interface_port m_axi_gmem io_mem_ar_bits_burst arburst Output 2
-add_interface_port m_axi_gmem io_mem_ar_bits_cache arcache Output 4
-add_interface_port m_axi_gmem io_mem_ar_bits_len arlen Output 4
-add_interface_port m_axi_gmem io_mem_ar_bits_lock arlock Output 2
-add_interface_port m_axi_gmem io_mem_ar_bits_prot arprot Output 3
-add_interface_port m_axi_gmem io_mem_ar_bits_size arsize Output 3
-add_interface_port m_axi_gmem io_mem_ar_bits_user aruser Output 5
-add_interface_port m_axi_gmem io_mem_ar_bits_id arid Output 1
-add_interface_port m_axi_gmem io_mem_r_ready rready Output 1
-add_interface_port m_axi_gmem io_mem_r_valid rvalid Input 1
-add_interface_port m_axi_gmem io_mem_r_bits_data rdata Input 64
-add_interface_port m_axi_gmem io_mem_r_bits_id rid Input 1
-add_interface_port m_axi_gmem io_mem_r_bits_last rlast Input 1
-add_interface_port m_axi_gmem io_mem_r_bits_resp rresp Input 2
-add_interface_port m_axi_gmem io_mem_aw_valid awvalid Output 1
-add_interface_port m_axi_gmem io_mem_aw_ready awready Input 1
-add_interface_port m_axi_gmem io_mem_aw_bits_addr awaddr Output 32
-add_interface_port m_axi_gmem io_mem_aw_bits_prot awprot Output 3
-add_interface_port m_axi_gmem io_mem_aw_bits_burst awburst Output 2
-add_interface_port m_axi_gmem io_mem_aw_bits_cache awcache Output 4
-add_interface_port m_axi_gmem io_mem_aw_bits_len awlen Output 4
-add_interface_port m_axi_gmem io_mem_aw_bits_lock awlock Output 2
-add_interface_port m_axi_gmem io_mem_aw_bits_size awsize Output 3
-add_interface_port m_axi_gmem io_mem_aw_bits_user awuser Output 5
-add_interface_port m_axi_gmem io_mem_aw_bits_id awid Output 1
-add_interface_port m_axi_gmem io_mem_w_bits_data wdata Output 64
-add_interface_port m_axi_gmem io_mem_w_ready wready Input 1
-add_interface_port m_axi_gmem io_mem_w_valid wvalid Output 1
-add_interface_port m_axi_gmem io_mem_w_bits_last wlast Output 1
-add_interface_port m_axi_gmem io_mem_w_bits_strb wstrb Output 8
-add_interface_port m_axi_gmem io_mem_w_bits_id wid Output 1
-add_interface_port m_axi_gmem io_mem_b_ready bready Output 1
-add_interface_port m_axi_gmem io_mem_b_valid bvalid Input 1
-add_interface_port m_axi_gmem io_mem_b_bits_resp bresp Input 2
-add_interface_port m_axi_gmem io_mem_b_bits_id bid Input 1
-
-# connection point s_axi_control
-add_interface s_axi_control axi end
-set_interface_property s_axi_control associatedClock clock
-set_interface_property s_axi_control associatedReset reset
-set_interface_property s_axi_control readAcceptanceCapability 1
-set_interface_property s_axi_control writeAcceptanceCapability 1
-set_interface_property s_axi_control combinedAcceptanceCapability 1
-set_interface_property s_axi_control readDataReorderingDepth 1
-set_interface_property s_axi_control bridgesToMaster ""
-set_interface_property s_axi_control ENABLED true
-set_interface_property s_axi_control EXPORT_OF ""
-set_interface_property s_axi_control PORT_NAME_MAP ""
-set_interface_property s_axi_control CMSIS_SVD_VARIABLES ""
-set_interface_property s_axi_control SVD_ADDRESS_GROUP ""
-
-add_interface_port s_axi_control io_host_aw_ready awready Output 1
-add_interface_port s_axi_control io_host_aw_valid awvalid Input 1
-add_interface_port s_axi_control io_host_aw_bits_addr awaddr Input 16
-add_interface_port s_axi_control io_host_aw_bits_prot awprot Input 3
-add_interface_port s_axi_control io_host_w_valid wvalid Input 1
-add_interface_port s_axi_control io_host_w_ready wready Output 1
-add_interface_port s_axi_control io_host_w_bits_data wdata Input 32
-add_interface_port s_axi_control io_host_b_ready bready Input 1
-add_interface_port s_axi_control io_host_b_valid bvalid Output 1
-add_interface_port s_axi_control io_host_b_bits_resp bresp Output 2
-add_interface_port s_axi_control io_host_ar_ready arready Output 1
-add_interface_port s_axi_control io_host_ar_valid arvalid Input 1
-add_interface_port s_axi_control io_host_ar_bits_addr araddr Input 16
-add_interface_port s_axi_control io_host_ar_bits_prot arprot Input 3
-add_interface_port s_axi_control io_host_r_ready rready Input 1
-add_interface_port s_axi_control io_host_r_valid rvalid Output 1
-add_interface_port s_axi_control io_host_r_bits_resp rresp Output 2
-add_interface_port s_axi_control io_host_r_bits_data rdata Output 32
-add_interface_port s_axi_control io_host_aw_bits_id awid Input 13
-add_interface_port s_axi_control io_host_ar_bits_id arid Input 13
-add_interface_port s_axi_control io_host_aw_bits_len awlen Input 4
-add_interface_port s_axi_control io_host_ar_bits_size arsize Input 3
-add_interface_port s_axi_control io_host_r_bits_id rid Output 13
-add_interface_port s_axi_control io_host_w_bits_id wid Input 13
-add_interface_port s_axi_control io_host_b_bits_id bid Output 13
-add_interface_port s_axi_control io_host_aw_bits_size awsize Input 3
-add_interface_port s_axi_control io_host_aw_bits_burst awburst Input 2
-add_interface_port s_axi_control io_host_aw_bits_lock awlock Input 2
-add_interface_port s_axi_control io_host_aw_bits_cache awcache Input 4
-add_interface_port s_axi_control io_host_ar_bits_burst arburst Input 2
-add_interface_port s_axi_control io_host_ar_bits_cache arcache Input 4
-add_interface_port s_axi_control io_host_ar_bits_len arlen Input 4
-add_interface_port s_axi_control io_host_ar_bits_lock arlock Input 2
-add_interface_port s_axi_control io_host_r_bits_last rlast Output 1
-add_interface_port s_axi_control io_host_w_bits_last wlast Input 1
-add_interface_port s_axi_control io_host_w_bits_strb wstrb Input 4
diff --git a/vta/vta-hw/hardware/intel/scripts/set_attrs.py b/vta/vta-hw/hardware/intel/scripts/set_attrs.py
deleted file mode 100755
index 9a7509ec6283..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/set_attrs.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os, sys
-import argparse
-
-def set_attrs(fname, fname_out, dsp=False, verbose=True):
-    """Set attributes to precompiled verilog code to indicate synthesis preference.
-
-    Parameters
-    ----------
-    fname : str
-        The name of input verilog source code file.
-
-    fname_out : str
-        The name of output verilog source code file.
-    """
-    out = ""
-    with open(fname, 'rt') as fp:
-        module = ''
-        for idx, line in enumerate(fp):
-            if 'module' in line:
-                module = line[line.find('module')+7:line.find('(')]
-                out += line
-            elif " * " in line:
-                if dsp:
-                    line = line.replace(" * ", ' * (* multstyle="dsp" *) ')
-                else:
-                    line = line.replace(" * ", ' * (* multstyle="logic" *) ')
-                if verbose:
-                    print(fname_out+":"+str(idx+1)+": "+module+":"+line[1:line.find(";")+1])
-                out += line
-            elif "rA;" in line:
-                line = line.replace("rA;", 'rA /* synthesis noprune */;')
-                if verbose:
-                    print(fname_out+":"+str(idx+1)+": "+module+":"+line[1:line.find(";")+1])
-                out += line
-            elif "rB;" in line:
-                line = line.replace("rB;", 'rB /* synthesis noprune */;')
-                if verbose:
-                    print(fname_out+":"+str(idx+1)+": "+module+":"+line[1:line.find(";")+1])
-                out += line
-            elif "rC;" in line:
-                line = line.replace("rC;", 'rC /* synthesis noprune */;')
-                if verbose:
-                    print(fname_out+":"+str(idx+1)+": "+module+":"+line[1:line.find(";")+1])
-                out += line
-            else:
-                out += line
-    with open(fname_out, 'wt') as fp:
-        fp.write(out)
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(description='Set attributes to precompiled ' +
-                                     'verilog code to indicate synthesis preference')
-    parser.add_argument('-i', '--input', type=str, default='VTA.DefaultDe10Config.v',
-                        help='input verilog file to be decorated')
-    parser.add_argument('-o', '--output', type=str, default='IntelShell.v',
-                        help='decorated verilog file')
-    parser.add_argument('--dsp', default=False, action='store_true',
-                        help='use dsp instead of logic.')
-    parser.add_argument('--verbose', default=False, action='store_true',
-                        help='print output file name and decorated lines.')
-    args = parser.parse_args()
-    set_attrs(args.input, args.output, args.dsp, args.verbose)
diff --git a/vta/vta-hw/hardware/intel/scripts/set_clocks.sdc b/vta/vta-hw/hardware/intel/scripts/set_clocks.sdc
deleted file mode 100644
index b28e01d2549c..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/set_clocks.sdc
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#**************************************************************
-# This .sdc file is created by Terasic Tool.
-# Users are recommended to modify this file to match users logic.
-#**************************************************************
-
-# Create Clock
-create_clock -period "50.0 MHz" [get_ports FPGA_CLK1_50]
-create_clock -period "50.0 MHz" [get_ports FPGA_CLK2_50]
-create_clock -period "50.0 MHz" [get_ports FPGA_CLK3_50]
-
-# for enhancing USB BlasterII to be reliable, 25MHz
-create_clock -name {altera_reserved_tck} -period 40 {altera_reserved_tck}
-set_input_delay -clock altera_reserved_tck -clock_fall 3 [get_ports altera_reserved_tdi]
-set_input_delay -clock altera_reserved_tck -clock_fall 3 [get_ports altera_reserved_tms]
-set_output_delay -clock altera_reserved_tck 3 [get_ports altera_reserved_tdo]
-
-# Turn off warning on unconstrained LED port.
-set_false_path -to [get_ports {LED[0]}]
-
-# Create Generated Clock
-derive_pll_clocks
-
-# Set Clock Uncertainty
-derive_clock_uncertainty
diff --git a/vta/vta-hw/hardware/intel/scripts/soc_system.tcl b/vta/vta-hw/hardware/intel/scripts/soc_system.tcl
deleted file mode 100644
index eea815d47558..000000000000
--- a/vta/vta-hw/hardware/intel/scripts/soc_system.tcl
+++ /dev/null
@@ -1,760 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package require -exact qsys 16.0
-
-create_system soc_system
-
-set_project_property DEVICE [lindex $argv 0]
-set_project_property DEVICE_FAMILY [lindex $argv 1]
-set FREQ_MHZ [lindex $argv 2]
-
-set_project_property HIDE_FROM_IP_CATALOG {false}
-
-# Instances and instance parameters
-# (disabled instances are intentionally culled)
-add_instance clk_0 clock_source 18.1
-set_instance_parameter_value clk_0 {clockFrequency} {50000000.0}
-set_instance_parameter_value clk_0 {clockFrequencyKnown} {1}
-set_instance_parameter_value clk_0 {resetSynchronousEdges} {NONE}
-
-add_instance hps_0 altera_hps 18.1
-set_instance_parameter_value hps_0 {ABSTRACT_REAL_COMPARE_TEST} {0}
-set_instance_parameter_value hps_0 {ABS_RAM_MEM_INIT_FILENAME} {meminit}
-set_instance_parameter_value hps_0 {ACV_PHY_CLK_ADD_FR_PHASE} {0.0}
-set_instance_parameter_value hps_0 {AC_PACKAGE_DESKEW} {0}
-set_instance_parameter_value hps_0 {AC_ROM_USER_ADD_0} {0_0000_0000_0000}
-set_instance_parameter_value hps_0 {AC_ROM_USER_ADD_1} {0_0000_0000_1000}
-set_instance_parameter_value hps_0 {ADDR_ORDER} {0}
-set_instance_parameter_value hps_0 {ADD_EFFICIENCY_MONITOR} {0}
-set_instance_parameter_value hps_0 {ADD_EXTERNAL_SEQ_DEBUG_NIOS} {0}
-set_instance_parameter_value hps_0 {ADVANCED_CK_PHASES} {0}
-set_instance_parameter_value hps_0 {ADVERTIZE_SEQUENCER_SW_BUILD_FILES} {0}
-set_instance_parameter_value hps_0 {AFI_DEBUG_INFO_WIDTH} {32}
-set_instance_parameter_value hps_0 {ALTMEMPHY_COMPATIBLE_MODE} {0}
-set_instance_parameter_value hps_0 {AP_MODE} {0}
-set_instance_parameter_value hps_0 {AP_MODE_EN} {0}
-set_instance_parameter_value hps_0 {AUTO_PD_CYCLES} {0}
-set_instance_parameter_value hps_0 {AUTO_POWERDN_EN} {0}
-set_instance_parameter_value hps_0 {AVL_DATA_WIDTH_PORT} {32 32 32 32 32 32}
-set_instance_parameter_value hps_0 {AVL_MAX_SIZE} {4}
-set_instance_parameter_value hps_0 {BONDING_OUT_ENABLED} {0}
-set_instance_parameter_value hps_0 {BOOTFROMFPGA_Enable} {0}
-set_instance_parameter_value hps_0 {BSEL} {1}
-set_instance_parameter_value hps_0 {BSEL_EN} {0}
-set_instance_parameter_value hps_0 {BYTE_ENABLE} {1}
-set_instance_parameter_value hps_0 {C2P_WRITE_CLOCK_ADD_PHASE} {0.0}
-set_instance_parameter_value hps_0 {CALIBRATION_MODE} {Skip}
-set_instance_parameter_value hps_0 {CALIB_REG_WIDTH} {8}
-set_instance_parameter_value hps_0 {CAN0_Mode} {N/A}
-set_instance_parameter_value hps_0 {CAN0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {CAN1_Mode} {N/A}
-set_instance_parameter_value hps_0 {CAN1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {CFG_DATA_REORDERING_TYPE} {INTER_BANK}
-set_instance_parameter_value hps_0 {CFG_REORDER_DATA} {1}
-set_instance_parameter_value hps_0 {CFG_TCCD_NS} {2.5}
-set_instance_parameter_value hps_0 {COMMAND_PHASE} {0.0}
-set_instance_parameter_value hps_0 {CONTROLLER_LATENCY} {5}
-set_instance_parameter_value hps_0 {CORE_DEBUG_CONNECTION} {EXPORT}
-set_instance_parameter_value hps_0 {CPORT_TYPE_PORT} {Bidirectional Bidirectional Bidirectional Bidirectional Bidirectional Bidirectional}
-set_instance_parameter_value hps_0 {CSEL} {0}
-set_instance_parameter_value hps_0 {CSEL_EN} {0}
-set_instance_parameter_value hps_0 {CTI_Enable} {0}
-set_instance_parameter_value hps_0 {CTL_AUTOPCH_EN} {0}
-set_instance_parameter_value hps_0 {CTL_CMD_QUEUE_DEPTH} {8}
-set_instance_parameter_value hps_0 {CTL_CSR_CONNECTION} {INTERNAL_JTAG}
-set_instance_parameter_value hps_0 {CTL_CSR_ENABLED} {0}
-set_instance_parameter_value hps_0 {CTL_CSR_READ_ONLY} {1}
-set_instance_parameter_value hps_0 {CTL_DEEP_POWERDN_EN} {0}
-set_instance_parameter_value hps_0 {CTL_DYNAMIC_BANK_ALLOCATION} {0}
-set_instance_parameter_value hps_0 {CTL_DYNAMIC_BANK_NUM} {4}
-set_instance_parameter_value hps_0 {CTL_ECC_AUTO_CORRECTION_ENABLED} {0}
-set_instance_parameter_value hps_0 {CTL_ECC_ENABLED} {0}
-set_instance_parameter_value hps_0 {CTL_ENABLE_BURST_INTERRUPT} {0}
-set_instance_parameter_value hps_0 {CTL_ENABLE_BURST_TERMINATE} {0}
-set_instance_parameter_value hps_0 {CTL_HRB_ENABLED} {0}
-set_instance_parameter_value hps_0 {CTL_LOOK_AHEAD_DEPTH} {4}
-set_instance_parameter_value hps_0 {CTL_SELF_REFRESH_EN} {0}
-set_instance_parameter_value hps_0 {CTL_USR_REFRESH_EN} {0}
-set_instance_parameter_value hps_0 {CTL_ZQCAL_EN} {0}
-set_instance_parameter_value hps_0 {CUT_NEW_FAMILY_TIMING} {1}
-set_instance_parameter_value hps_0 {DAT_DATA_WIDTH} {32}
-set_instance_parameter_value hps_0 {DEBUGAPB_Enable} {0}
-set_instance_parameter_value hps_0 {DEBUG_MODE} {0}
-set_instance_parameter_value hps_0 {DEVICE_DEPTH} {1}
-set_instance_parameter_value hps_0 {DEVICE_FAMILY_PARAM} {}
-set_instance_parameter_value hps_0 {DISABLE_CHILD_MESSAGING} {0}
-set_instance_parameter_value hps_0 {DISCRETE_FLY_BY} {1}
-set_instance_parameter_value hps_0 {DLL_SHARING_MODE} {None}
-set_instance_parameter_value hps_0 {DMA_Enable} {No No No No No No No No}
-set_instance_parameter_value hps_0 {DQS_DQSN_MODE} {DIFFERENTIAL}
-set_instance_parameter_value hps_0 {DQ_INPUT_REG_USE_CLKN} {0}
-set_instance_parameter_value hps_0 {DUPLICATE_AC} {0}
-set_instance_parameter_value hps_0 {ED_EXPORT_SEQ_DEBUG} {0}
-set_instance_parameter_value hps_0 {EMAC0_Mode} {N/A}
-set_instance_parameter_value hps_0 {EMAC0_PTP} {0}
-set_instance_parameter_value hps_0 {EMAC0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {EMAC1_Mode} {N/A}
-set_instance_parameter_value hps_0 {EMAC1_PTP} {0}
-set_instance_parameter_value hps_0 {EMAC1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {ENABLE_ABS_RAM_MEM_INIT} {0}
-set_instance_parameter_value hps_0 {ENABLE_BONDING} {0}
-set_instance_parameter_value hps_0 {ENABLE_BURST_MERGE} {0}
-set_instance_parameter_value hps_0 {ENABLE_CTRL_AVALON_INTERFACE} {1}
-set_instance_parameter_value hps_0 {ENABLE_DELAY_CHAIN_WRITE} {0}
-set_instance_parameter_value hps_0 {ENABLE_EMIT_BFM_MASTER} {0}
-set_instance_parameter_value hps_0 {ENABLE_EXPORT_SEQ_DEBUG_BRIDGE} {0}
-set_instance_parameter_value hps_0 {ENABLE_EXTRA_REPORTING} {0}
-set_instance_parameter_value hps_0 {ENABLE_ISS_PROBES} {0}
-set_instance_parameter_value hps_0 {ENABLE_NON_DESTRUCTIVE_CALIB} {0}
-set_instance_parameter_value hps_0 {ENABLE_NON_DES_CAL} {0}
-set_instance_parameter_value hps_0 {ENABLE_NON_DES_CAL_TEST} {0}
-set_instance_parameter_value hps_0 {ENABLE_SEQUENCER_MARGINING_ON_BY_DEFAULT} {0}
-set_instance_parameter_value hps_0 {ENABLE_USER_ECC} {0}
-set_instance_parameter_value hps_0 {EXPORT_AFI_HALF_CLK} {0}
-set_instance_parameter_value hps_0 {EXTRA_SETTINGS} {}
-set_instance_parameter_value hps_0 {F2SCLK_COLDRST_Enable} {0}
-set_instance_parameter_value hps_0 {F2SCLK_DBGRST_Enable} {0}
-set_instance_parameter_value hps_0 {F2SCLK_PERIPHCLK_Enable} {0}
-set_instance_parameter_value hps_0 {F2SCLK_SDRAMCLK_Enable} {0}
-set_instance_parameter_value hps_0 {F2SCLK_WARMRST_Enable} {0}
-set_instance_parameter_value hps_0 {F2SDRAM_Type} {}
-set_instance_parameter_value hps_0 {F2SDRAM_Width} {}
-set_instance_parameter_value hps_0 {F2SINTERRUPT_Enable} {0}
-set_instance_parameter_value hps_0 {F2S_Width} {2}
-set_instance_parameter_value hps_0 {FIX_READ_LATENCY} {8}
-set_instance_parameter_value hps_0 {FORCED_NON_LDC_ADDR_CMD_MEM_CK_INVERT} {0}
-set_instance_parameter_value hps_0 {FORCED_NUM_WRITE_FR_CYCLE_SHIFTS} {0}
-set_instance_parameter_value hps_0 {FORCE_DQS_TRACKING} {AUTO}
-set_instance_parameter_value hps_0 {FORCE_MAX_LATENCY_COUNT_WIDTH} {0}
-set_instance_parameter_value hps_0 {FORCE_SEQUENCER_TCL_DEBUG_MODE} {0}
-set_instance_parameter_value hps_0 {FORCE_SHADOW_REGS} {AUTO}
-set_instance_parameter_value hps_0 {FORCE_SYNTHESIS_LANGUAGE} {}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_EMAC0_GTX_CLK} {125}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_EMAC0_MD_CLK} {2.5}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_EMAC1_GTX_CLK} {125}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_EMAC1_MD_CLK} {2.5}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_I2C0_CLK} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_I2C1_CLK} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_I2C2_CLK} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_I2C3_CLK} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_QSPI_SCLK_OUT} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_SDIO_CCLK} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_SPIM0_SCLK_OUT} {100}
-set_instance_parameter_value hps_0 {FPGA_PERIPHERAL_OUTPUT_CLOCK_FREQ_SPIM1_SCLK_OUT} {100}
-set_instance_parameter_value hps_0 {GPIO_Enable} {No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No}
-set_instance_parameter_value hps_0 {GP_Enable} {0}
-set_instance_parameter_value hps_0 {HARD_EMIF} {1}
-set_instance_parameter_value hps_0 {HCX_COMPAT_MODE} {0}
-set_instance_parameter_value hps_0 {HHP_HPS} {1}
-set_instance_parameter_value hps_0 {HHP_HPS_SIMULATION} {0}
-set_instance_parameter_value hps_0 {HHP_HPS_VERIFICATION} {0}
-set_instance_parameter_value hps_0 {HLGPI_Enable} {0}
-set_instance_parameter_value hps_0 {HPS_PROTOCOL} {DDR3}
-set_instance_parameter_value hps_0 {I2C0_Mode} {N/A}
-set_instance_parameter_value hps_0 {I2C0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {I2C1_Mode} {N/A}
-set_instance_parameter_value hps_0 {I2C1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {I2C2_Mode} {N/A}
-set_instance_parameter_value hps_0 {I2C2_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {I2C3_Mode} {N/A}
-set_instance_parameter_value hps_0 {I2C3_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {INCLUDE_BOARD_DELAY_MODEL} {0}
-set_instance_parameter_value hps_0 {INCLUDE_MULTIRANK_BOARD_DELAY_MODEL} {0}
-set_instance_parameter_value hps_0 {IS_ES_DEVICE} {0}
-set_instance_parameter_value hps_0 {LOANIO_Enable} {No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No}
-set_instance_parameter_value hps_0 {LOCAL_ID_WIDTH} {8}
-set_instance_parameter_value hps_0 {LRDIMM_EXTENDED_CONFIG} {0x000000000000000000}
-set_instance_parameter_value hps_0 {LWH2F_Enable} {true}
-set_instance_parameter_value hps_0 {MARGIN_VARIATION_TEST} {0}
-set_instance_parameter_value hps_0 {MAX_PENDING_RD_CMD} {32}
-set_instance_parameter_value hps_0 {MAX_PENDING_WR_CMD} {16}
-set_instance_parameter_value hps_0 {MEM_ASR} {Manual}
-set_instance_parameter_value hps_0 {MEM_ATCL} {Disabled}
-set_instance_parameter_value hps_0 {MEM_AUTO_LEVELING_MODE} {1}
-set_instance_parameter_value hps_0 {MEM_BANKADDR_WIDTH} {3}
-set_instance_parameter_value hps_0 {MEM_BL} {OTF}
-set_instance_parameter_value hps_0 {MEM_BT} {Sequential}
-set_instance_parameter_value hps_0 {MEM_CK_PHASE} {0.0}
-set_instance_parameter_value hps_0 {MEM_CK_WIDTH} {1}
-set_instance_parameter_value hps_0 {MEM_CLK_EN_WIDTH} {1}
-set_instance_parameter_value hps_0 {MEM_CLK_FREQ} {400.0}
-set_instance_parameter_value hps_0 {MEM_CLK_FREQ_MAX} {800.0}
-set_instance_parameter_value hps_0 {MEM_COL_ADDR_WIDTH} {10}
-set_instance_parameter_value hps_0 {MEM_CS_WIDTH} {1}
-set_instance_parameter_value hps_0 {MEM_DEVICE} {MISSING_MODEL}
-set_instance_parameter_value hps_0 {MEM_DLL_EN} {1}
-set_instance_parameter_value hps_0 {MEM_DQ_PER_DQS} {8}
-set_instance_parameter_value hps_0 {MEM_DQ_WIDTH} {32}
-set_instance_parameter_value hps_0 {MEM_DRV_STR} {RZQ/6}
-set_instance_parameter_value hps_0 {MEM_FORMAT} {DISCRETE}
-set_instance_parameter_value hps_0 {MEM_GUARANTEED_WRITE_INIT} {0}
-set_instance_parameter_value hps_0 {MEM_IF_BOARD_BASE_DELAY} {10}
-set_instance_parameter_value hps_0 {MEM_IF_DM_PINS_EN} {1}
-set_instance_parameter_value hps_0 {MEM_IF_DQSN_EN} {1}
-set_instance_parameter_value hps_0 {MEM_IF_SIM_VALID_WINDOW} {0}
-set_instance_parameter_value hps_0 {MEM_INIT_EN} {0}
-set_instance_parameter_value hps_0 {MEM_INIT_FILE} {}
-set_instance_parameter_value hps_0 {MEM_MIRROR_ADDRESSING} {0}
-set_instance_parameter_value hps_0 {MEM_NUMBER_OF_DIMMS} {1}
-set_instance_parameter_value hps_0 {MEM_NUMBER_OF_RANKS_PER_DEVICE} {1}
-set_instance_parameter_value hps_0 {MEM_NUMBER_OF_RANKS_PER_DIMM} {1}
-set_instance_parameter_value hps_0 {MEM_PD} {DLL off}
-set_instance_parameter_value hps_0 {MEM_RANK_MULTIPLICATION_FACTOR} {1}
-set_instance_parameter_value hps_0 {MEM_ROW_ADDR_WIDTH} {15}
-set_instance_parameter_value hps_0 {MEM_RTT_NOM} {RZQ/6}
-set_instance_parameter_value hps_0 {MEM_RTT_WR} {Dynamic ODT off}
-set_instance_parameter_value hps_0 {MEM_SRT} {Normal}
-set_instance_parameter_value hps_0 {MEM_TCL} {7}
-set_instance_parameter_value hps_0 {MEM_TFAW_NS} {37.5}
-set_instance_parameter_value hps_0 {MEM_TINIT_US} {500}
-set_instance_parameter_value hps_0 {MEM_TMRD_CK} {4}
-set_instance_parameter_value hps_0 {MEM_TRAS_NS} {35.0}
-set_instance_parameter_value hps_0 {MEM_TRCD_NS} {13.75}
-set_instance_parameter_value hps_0 {MEM_TREFI_US} {7.8}
-set_instance_parameter_value hps_0 {MEM_TRFC_NS} {300.0}
-set_instance_parameter_value hps_0 {MEM_TRP_NS} {13.75}
-set_instance_parameter_value hps_0 {MEM_TRRD_NS} {7.5}
-set_instance_parameter_value hps_0 {MEM_TRTP_NS} {7.5}
-set_instance_parameter_value hps_0 {MEM_TWR_NS} {15.0}
-set_instance_parameter_value hps_0 {MEM_TWTR} {4}
-set_instance_parameter_value hps_0 {MEM_USER_LEVELING_MODE} {Leveling}
-set_instance_parameter_value hps_0 {MEM_VENDOR} {Other}
-set_instance_parameter_value hps_0 {MEM_VERBOSE} {1}
-set_instance_parameter_value hps_0 {MEM_VOLTAGE} {1.5V DDR3}
-set_instance_parameter_value hps_0 {MEM_WTCL} {7}
-set_instance_parameter_value hps_0 {MPU_EVENTS_Enable} {0}
-set_instance_parameter_value hps_0 {MRS_MIRROR_PING_PONG_ATSO} {0}
-set_instance_parameter_value hps_0 {MULTICAST_EN} {0}
-set_instance_parameter_value hps_0 {NAND_Mode} {N/A}
-set_instance_parameter_value hps_0 {NAND_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {NEXTGEN} {1}
-set_instance_parameter_value hps_0 {NIOS_ROM_DATA_WIDTH} {32}
-set_instance_parameter_value hps_0 {NUM_DLL_SHARING_INTERFACES} {1}
-set_instance_parameter_value hps_0 {NUM_EXTRA_REPORT_PATH} {10}
-set_instance_parameter_value hps_0 {NUM_OCT_SHARING_INTERFACES} {1}
-set_instance_parameter_value hps_0 {NUM_OF_PORTS} {1}
-set_instance_parameter_value hps_0 {NUM_PLL_SHARING_INTERFACES} {1}
-set_instance_parameter_value hps_0 {OCT_SHARING_MODE} {None}
-set_instance_parameter_value hps_0 {P2C_READ_CLOCK_ADD_PHASE} {0.0}
-set_instance_parameter_value hps_0 {PACKAGE_DESKEW} {0}
-set_instance_parameter_value hps_0 {PARSE_FRIENDLY_DEVICE_FAMILY_PARAM} {}
-set_instance_parameter_value hps_0 {PARSE_FRIENDLY_DEVICE_FAMILY_PARAM_VALID} {0}
-set_instance_parameter_value hps_0 {PHY_CSR_CONNECTION} {INTERNAL_JTAG}
-set_instance_parameter_value hps_0 {PHY_CSR_ENABLED} {0}
-set_instance_parameter_value hps_0 {PHY_ONLY} {0}
-set_instance_parameter_value hps_0 {PINGPONGPHY_EN} {0}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_ADDR_CMD_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_HALF_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_AFI_PHY_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_C2P_WRITE_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_CLK_PARAM_VALID} {0}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_CONFIG_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_DR_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_HR_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_LOCATION} {Top_Bottom}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_MEM_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_NIOS_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_P2C_READ_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_SHARING_MODE} {None}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_DIV_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_FREQ_PARAM} {0.0}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_FREQ_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_MULT_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_PHASE_PS_PARAM} {0}
-set_instance_parameter_value hps_0 {PLL_WRITE_CLK_PHASE_PS_SIM_STR_PARAM} {}
-set_instance_parameter_value hps_0 {POWER_OF_TWO_BUS} {0}
-set_instance_parameter_value hps_0 {PRIORITY_PORT} {1 1 1 1 1 1}
-set_instance_parameter_value hps_0 {QSPI_Mode} {N/A}
-set_instance_parameter_value hps_0 {QSPI_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {RATE} {Full}
-set_instance_parameter_value hps_0 {RDIMM_CONFIG} {0000000000000000}
-set_instance_parameter_value hps_0 {READ_DQ_DQS_CLOCK_SOURCE} {INVERTED_DQS_BUS}
-set_instance_parameter_value hps_0 {READ_FIFO_SIZE} {8}
-set_instance_parameter_value hps_0 {REFRESH_BURST_VALIDATION} {0}
-set_instance_parameter_value hps_0 {REFRESH_INTERVAL} {15000}
-set_instance_parameter_value hps_0 {REF_CLK_FREQ} {125.0}
-set_instance_parameter_value hps_0 {REF_CLK_FREQ_MAX_PARAM} {0.0}
-set_instance_parameter_value hps_0 {REF_CLK_FREQ_MIN_PARAM} {0.0}
-set_instance_parameter_value hps_0 {REF_CLK_FREQ_PARAM_VALID} {0}
-set_instance_parameter_value hps_0 {S2FCLK_COLDRST_Enable} {0}
-set_instance_parameter_value hps_0 {S2FCLK_PENDINGRST_Enable} {0}
-set_instance_parameter_value hps_0 {S2FCLK_USER0CLK_Enable} {0}
-set_instance_parameter_value hps_0 {S2FCLK_USER1CLK_Enable} {0}
-set_instance_parameter_value hps_0 {S2FCLK_USER1CLK_FREQ} {100.0}
-set_instance_parameter_value hps_0 {S2FCLK_USER2CLK} {5}
-set_instance_parameter_value hps_0 {S2FCLK_USER2CLK_Enable} {0}
-set_instance_parameter_value hps_0 {S2FCLK_USER2CLK_FREQ} {100.0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_CAN_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_CLOCKPERIPHERAL_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_CTI_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_DMA_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_EMAC_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_FPGAMANAGER_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_GPIO_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_I2CEMAC_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_I2CPERIPHERAL_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_L4TIMER_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_NAND_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_OSCTIMER_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_QSPI_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_SDMMC_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_SPIMASTER_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_SPISLAVE_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_UART_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_USB_Enable} {0}
-set_instance_parameter_value hps_0 {S2FINTERRUPT_WATCHDOG_Enable} {0}
-set_instance_parameter_value hps_0 {S2F_Width} {0}
-set_instance_parameter_value hps_0 {SDIO_Mode} {N/A}
-set_instance_parameter_value hps_0 {SDIO_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {SEQUENCER_TYPE} {NIOS}
-set_instance_parameter_value hps_0 {SEQ_MODE} {0}
-set_instance_parameter_value hps_0 {SKIP_MEM_INIT} {1}
-set_instance_parameter_value hps_0 {SOPC_COMPAT_RESET} {0}
-set_instance_parameter_value hps_0 {SPEED_GRADE} {7}
-set_instance_parameter_value hps_0 {SPIM0_Mode} {N/A}
-set_instance_parameter_value hps_0 {SPIM0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {SPIM1_Mode} {N/A}
-set_instance_parameter_value hps_0 {SPIM1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {SPIS0_Mode} {N/A}
-set_instance_parameter_value hps_0 {SPIS0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {SPIS1_Mode} {N/A}
-set_instance_parameter_value hps_0 {SPIS1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {STARVE_LIMIT} {10}
-set_instance_parameter_value hps_0 {STM_Enable} {0}
-set_instance_parameter_value hps_0 {TEST_Enable} {0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_AC_EYE_REDUCTION_H} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_AC_EYE_REDUCTION_SU} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_AC_SKEW} {0.02}
-set_instance_parameter_value hps_0 {TIMING_BOARD_AC_SLEW_RATE} {1.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_AC_TO_CK_SKEW} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_CK_CKN_SLEW_RATE} {2.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DELTA_DQS_ARRIVAL_TIME} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DELTA_READ_DQS_ARRIVAL_TIME} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DERATE_METHOD} {AUTO}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DQS_DQSN_SLEW_RATE} {2.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DQ_EYE_REDUCTION} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DQ_SLEW_RATE} {1.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_DQ_TO_DQS_SKEW} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_ISI_METHOD} {AUTO}
-set_instance_parameter_value hps_0 {TIMING_BOARD_MAX_CK_DELAY} {0.6}
-set_instance_parameter_value hps_0 {TIMING_BOARD_MAX_DQS_DELAY} {0.6}
-set_instance_parameter_value hps_0 {TIMING_BOARD_READ_DQ_EYE_REDUCTION} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_SKEW_BETWEEN_DIMMS} {0.05}
-set_instance_parameter_value hps_0 {TIMING_BOARD_SKEW_BETWEEN_DQS} {0.02}
-set_instance_parameter_value hps_0 {TIMING_BOARD_SKEW_CKDQS_DIMM_MAX} {0.01}
-set_instance_parameter_value hps_0 {TIMING_BOARD_SKEW_CKDQS_DIMM_MIN} {-0.01}
-set_instance_parameter_value hps_0 {TIMING_BOARD_SKEW_WITHIN_DQS} {0.02}
-set_instance_parameter_value hps_0 {TIMING_BOARD_TDH} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_TDS} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_TIH} {0.0}
-set_instance_parameter_value hps_0 {TIMING_BOARD_TIS} {0.0}
-set_instance_parameter_value hps_0 {TIMING_TDH} {125}
-set_instance_parameter_value hps_0 {TIMING_TDQSCK} {400}
-set_instance_parameter_value hps_0 {TIMING_TDQSCKDL} {1200}
-set_instance_parameter_value hps_0 {TIMING_TDQSCKDM} {900}
-set_instance_parameter_value hps_0 {TIMING_TDQSCKDS} {450}
-set_instance_parameter_value hps_0 {TIMING_TDQSH} {0.35}
-set_instance_parameter_value hps_0 {TIMING_TDQSQ} {120}
-set_instance_parameter_value hps_0 {TIMING_TDQSS} {0.25}
-set_instance_parameter_value hps_0 {TIMING_TDS} {50}
-set_instance_parameter_value hps_0 {TIMING_TDSH} {0.2}
-set_instance_parameter_value hps_0 {TIMING_TDSS} {0.2}
-set_instance_parameter_value hps_0 {TIMING_TIH} {250}
-set_instance_parameter_value hps_0 {TIMING_TIS} {175}
-set_instance_parameter_value hps_0 {TIMING_TQH} {0.38}
-set_instance_parameter_value hps_0 {TIMING_TQHS} {300}
-set_instance_parameter_value hps_0 {TIMING_TQSH} {0.38}
-set_instance_parameter_value hps_0 {TPIUFPGA_Enable} {0}
-set_instance_parameter_value hps_0 {TPIUFPGA_alt} {0}
-set_instance_parameter_value hps_0 {TRACE_Mode} {N/A}
-set_instance_parameter_value hps_0 {TRACE_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {TRACKING_ERROR_TEST} {0}
-set_instance_parameter_value hps_0 {TRACKING_WATCH_TEST} {0}
-set_instance_parameter_value hps_0 {TREFI} {35100}
-set_instance_parameter_value hps_0 {TRFC} {350}
-set_instance_parameter_value hps_0 {UART0_Mode} {N/A}
-set_instance_parameter_value hps_0 {UART0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {UART1_Mode} {N/A}
-set_instance_parameter_value hps_0 {UART1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {USB0_Mode} {N/A}
-set_instance_parameter_value hps_0 {USB0_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {USB1_Mode} {N/A}
-set_instance_parameter_value hps_0 {USB1_PinMuxing} {Unused}
-set_instance_parameter_value hps_0 {USER_DEBUG_LEVEL} {1}
-set_instance_parameter_value hps_0 {USE_AXI_ADAPTOR} {0}
-set_instance_parameter_value hps_0 {USE_FAKE_PHY} {0}
-set_instance_parameter_value hps_0 {USE_MEM_CLK_FREQ} {0}
-set_instance_parameter_value hps_0 {USE_MM_ADAPTOR} {1}
-set_instance_parameter_value hps_0 {USE_SEQUENCER_BFM} {0}
-set_instance_parameter_value hps_0 {WEIGHT_PORT} {0 0 0 0 0 0}
-set_instance_parameter_value hps_0 {WRBUFFER_ADDR_WIDTH} {6}
-set_instance_parameter_value hps_0 {can0_clk_div} {1}
-set_instance_parameter_value hps_0 {can1_clk_div} {1}
-set_instance_parameter_value hps_0 {configure_advanced_parameters} {0}
-set_instance_parameter_value hps_0 {customize_device_pll_info} {0}
-set_instance_parameter_value hps_0 {dbctrl_stayosc1} {1}
-set_instance_parameter_value hps_0 {dbg_at_clk_div} {0}
-set_instance_parameter_value hps_0 {dbg_clk_div} {1}
-set_instance_parameter_value hps_0 {dbg_trace_clk_div} {0}
-set_instance_parameter_value hps_0 {desired_can0_clk_mhz} {100.0}
-set_instance_parameter_value hps_0 {desired_can1_clk_mhz} {100.0}
-set_instance_parameter_value hps_0 {desired_cfg_clk_mhz} {100.0}
-set_instance_parameter_value hps_0 {desired_emac0_clk_mhz} {250.0}
-set_instance_parameter_value hps_0 {desired_emac1_clk_mhz} {250.0}
-set_instance_parameter_value hps_0 {desired_gpio_db_clk_hz} {32000}
-set_instance_parameter_value hps_0 {desired_l4_mp_clk_mhz} {100.0}
-set_instance_parameter_value hps_0 {desired_l4_sp_clk_mhz} {100.0}
-set_instance_parameter_value hps_0 {desired_mpu_clk_mhz} {800.0}
-set_instance_parameter_value hps_0 {desired_nand_clk_mhz} {12.5}
-set_instance_parameter_value hps_0 {desired_qspi_clk_mhz} {400.0}
-set_instance_parameter_value hps_0 {desired_sdmmc_clk_mhz} {200.0}
-set_instance_parameter_value hps_0 {desired_spi_m_clk_mhz} {200.0}
-set_instance_parameter_value hps_0 {desired_usb_mp_clk_mhz} {200.0}
-set_instance_parameter_value hps_0 {device_pll_info_manual} {{320000000 1600000000} {320000000 1000000000} {800000000 400000000 400000000}}
-set_instance_parameter_value hps_0 {eosc1_clk_mhz} {25.0}
-set_instance_parameter_value hps_0 {eosc2_clk_mhz} {25.0}
-set_instance_parameter_value hps_0 {gpio_db_clk_div} {6249}
-set_instance_parameter_value hps_0 {l3_mp_clk_div} {1}
-set_instance_parameter_value hps_0 {l3_sp_clk_div} {1}
-set_instance_parameter_value hps_0 {l4_mp_clk_div} {1}
-set_instance_parameter_value hps_0 {l4_mp_clk_source} {1}
-set_instance_parameter_value hps_0 {l4_sp_clk_div} {1}
-set_instance_parameter_value hps_0 {l4_sp_clk_source} {1}
-set_instance_parameter_value hps_0 {main_pll_c3} {3}
-set_instance_parameter_value hps_0 {main_pll_c4} {3}
-set_instance_parameter_value hps_0 {main_pll_c5} {15}
-set_instance_parameter_value hps_0 {main_pll_m} {63}
-set_instance_parameter_value hps_0 {main_pll_n} {0}
-set_instance_parameter_value hps_0 {nand_clk_source} {2}
-set_instance_parameter_value hps_0 {periph_pll_c0} {3}
-set_instance_parameter_value hps_0 {periph_pll_c1} {3}
-set_instance_parameter_value hps_0 {periph_pll_c2} {1}
-set_instance_parameter_value hps_0 {periph_pll_c3} {19}
-set_instance_parameter_value hps_0 {periph_pll_c4} {4}
-set_instance_parameter_value hps_0 {periph_pll_c5} {9}
-set_instance_parameter_value hps_0 {periph_pll_m} {79}
-set_instance_parameter_value hps_0 {periph_pll_n} {1}
-set_instance_parameter_value hps_0 {periph_pll_source} {0}
-set_instance_parameter_value hps_0 {qspi_clk_source} {1}
-set_instance_parameter_value hps_0 {sdmmc_clk_source} {2}
-set_instance_parameter_value hps_0 {show_advanced_parameters} {0}
-set_instance_parameter_value hps_0 {show_debug_info_as_warning_msg} {0}
-set_instance_parameter_value hps_0 {show_warning_as_error_msg} {0}
-set_instance_parameter_value hps_0 {spi_m_clk_div} {0}
-set_instance_parameter_value hps_0 {usb_mp_clk_div} {0}
-set_instance_parameter_value hps_0 {use_default_mpu_clk} {1}
-
-add_instance pll_0 altera_pll 18.1
-set_instance_parameter_value pll_0 {debug_print_output} {0}
-set_instance_parameter_value pll_0 {debug_use_rbc_taf_method} {0}
-set_instance_parameter_value pll_0 {gui_active_clk} {0}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency0} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency1} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency10} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency11} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency12} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency13} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency14} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency15} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency16} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency17} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency2} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency3} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency4} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency5} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency6} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency7} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency8} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_output_clock_frequency9} {0 MHz}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift0} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift1} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift10} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift11} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift12} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift13} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift14} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift15} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift16} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift17} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift2} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift3} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift4} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift5} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift6} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift7} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift8} {0}
-set_instance_parameter_value pll_0 {gui_actual_phase_shift9} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter0} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter1} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter10} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter11} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter12} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter13} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter14} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter15} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter16} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter17} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter2} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter3} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter4} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter5} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter6} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter7} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter8} {0}
-set_instance_parameter_value pll_0 {gui_cascade_counter9} {0}
-set_instance_parameter_value pll_0 {gui_cascade_outclk_index} {0}
-set_instance_parameter_value pll_0 {gui_channel_spacing} {0.0}
-set_instance_parameter_value pll_0 {gui_clk_bad} {0}
-set_instance_parameter_value pll_0 {gui_device_speed_grade} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c0} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c1} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c10} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c11} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c12} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c13} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c14} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c15} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c16} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c17} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c2} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c3} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c4} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c5} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c6} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c7} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c8} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_c9} {1}
-set_instance_parameter_value pll_0 {gui_divide_factor_n} {1}
-set_instance_parameter_value pll_0 {gui_dps_cntr} {C0}
-set_instance_parameter_value pll_0 {gui_dps_dir} {Positive}
-set_instance_parameter_value pll_0 {gui_dps_num} {1}
-set_instance_parameter_value pll_0 {gui_dsm_out_sel} {1st_order}
-set_instance_parameter_value pll_0 {gui_duty_cycle0} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle1} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle10} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle11} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle12} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle13} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle14} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle15} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle16} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle17} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle2} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle3} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle4} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle5} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle6} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle7} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle8} {50}
-set_instance_parameter_value pll_0 {gui_duty_cycle9} {50}
-set_instance_parameter_value pll_0 {gui_en_adv_params} {0}
-set_instance_parameter_value pll_0 {gui_en_dps_ports} {0}
-set_instance_parameter_value pll_0 {gui_en_phout_ports} {0}
-set_instance_parameter_value pll_0 {gui_en_reconf} {0}
-set_instance_parameter_value pll_0 {gui_enable_cascade_in} {0}
-set_instance_parameter_value pll_0 {gui_enable_cascade_out} {0}
-set_instance_parameter_value pll_0 {gui_enable_mif_dps} {0}
-set_instance_parameter_value pll_0 {gui_feedback_clock} {Global Clock}
-set_instance_parameter_value pll_0 {gui_frac_multiply_factor} {1.0}
-set_instance_parameter_value pll_0 {gui_fractional_cout} {32}
-set_instance_parameter_value pll_0 {gui_mif_generate} {0}
-set_instance_parameter_value pll_0 {gui_multiply_factor} {1}
-set_instance_parameter_value pll_0 {gui_number_of_clocks} {1}
-set_instance_parameter_value pll_0 {gui_operation_mode} {normal}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency0} $FREQ_MHZ
-set_instance_parameter_value pll_0 {gui_output_clock_frequency1} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency10} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency11} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency12} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency13} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency14} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency15} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency16} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency17} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency2} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency3} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency4} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency5} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency6} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency7} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency8} {100.0}
-set_instance_parameter_value pll_0 {gui_output_clock_frequency9} {100.0}
-set_instance_parameter_value pll_0 {gui_phase_shift0} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift1} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift10} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift11} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift12} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift13} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift14} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift15} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift16} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift17} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift2} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift3} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift4} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift5} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift6} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift7} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift8} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift9} {0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg0} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg1} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg10} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg11} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg12} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg13} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg14} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg15} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg16} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg17} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg2} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg3} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg4} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg5} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg6} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg7} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg8} {0.0}
-set_instance_parameter_value pll_0 {gui_phase_shift_deg9} {0.0}
-set_instance_parameter_value pll_0 {gui_phout_division} {1}
-set_instance_parameter_value pll_0 {gui_pll_auto_reset} {Off}
-set_instance_parameter_value pll_0 {gui_pll_bandwidth_preset} {Auto}
-set_instance_parameter_value pll_0 {gui_pll_cascading_mode} {Create an adjpllin signal to connect with an upstream PLL}
-set_instance_parameter_value pll_0 {gui_pll_mode} {Integer-N PLL}
-set_instance_parameter_value pll_0 {gui_ps_units0} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units1} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units10} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units11} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units12} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units13} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units14} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units15} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units16} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units17} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units2} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units3} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units4} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units5} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units6} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units7} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units8} {ps}
-set_instance_parameter_value pll_0 {gui_ps_units9} {ps}
-set_instance_parameter_value pll_0 {gui_refclk1_frequency} {100.0}
-set_instance_parameter_value pll_0 {gui_refclk_switch} {0}
-set_instance_parameter_value pll_0 {gui_reference_clock_frequency} {50.0}
-set_instance_parameter_value pll_0 {gui_switchover_delay} {0}
-set_instance_parameter_value pll_0 {gui_switchover_mode} {Automatic Switchover}
-set_instance_parameter_value pll_0 {gui_use_locked} {0}
-
-add_instance vta_0 vta 1.0
-
-# exported interfaces
-add_interface clk clock sink
-set_interface_property clk EXPORT_OF clk_0.clk_in
-add_interface hps_0_h2f_reset reset source
-set_interface_property hps_0_h2f_reset EXPORT_OF hps_0.h2f_reset
-add_interface memory conduit end
-set_interface_property memory EXPORT_OF hps_0.memory
-add_interface reset reset sink
-set_interface_property reset EXPORT_OF clk_0.clk_in_reset
-
-# connections and connection parameters
-add_connection clk_0.clk pll_0.refclk
-
-add_connection clk_0.clk_reset pll_0.reset
-
-add_connection clk_0.clk_reset vta_0.reset
-
-add_connection hps_0.h2f_lw_axi_master vta_0.s_axi_control
-set_connection_parameter_value hps_0.h2f_lw_axi_master/vta_0.s_axi_control arbitrationPriority {1}
-set_connection_parameter_value hps_0.h2f_lw_axi_master/vta_0.s_axi_control baseAddress {0x00020000}
-set_connection_parameter_value hps_0.h2f_lw_axi_master/vta_0.s_axi_control defaultConnection {0}
-
-add_connection pll_0.outclk0 hps_0.f2h_axi_clock
-
-add_connection pll_0.outclk0 hps_0.h2f_lw_axi_clock
-
-add_connection pll_0.outclk0 vta_0.clock
-
-add_connection vta_0.m_axi_gmem hps_0.f2h_axi_slave
-set_connection_parameter_value vta_0.m_axi_gmem/hps_0.f2h_axi_slave arbitrationPriority {1}
-set_connection_parameter_value vta_0.m_axi_gmem/hps_0.f2h_axi_slave baseAddress {0x0000}
-set_connection_parameter_value vta_0.m_axi_gmem/hps_0.f2h_axi_slave defaultConnection {0}
-
-# interconnect requirements
-set_interconnect_requirement {$system} {qsys_mm.clockCrossingAdapter} {HANDSHAKE}
-set_interconnect_requirement {$system} {qsys_mm.enableEccProtection} {FALSE}
-set_interconnect_requirement {$system} {qsys_mm.insertDefaultSlave} {FALSE}
-set_interconnect_requirement {$system} {qsys_mm.maxAdditionalLatency} {1}
-
-save_system soc_system.qsys
diff --git a/vta/vta-hw/hardware/xilinx/.gitignore b/vta/vta-hw/hardware/xilinx/.gitignore
deleted file mode 100644
index 54ff70b80cd1..000000000000
--- a/vta/vta-hw/hardware/xilinx/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-build
-*.out
-*.log
-*.sb
diff --git a/vta/vta-hw/hardware/xilinx/Makefile b/vta/vta-hw/hardware/xilinx/Makefile
deleted file mode 100644
index 2651583bea33..000000000000
--- a/vta/vta-hw/hardware/xilinx/Makefile
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Directories
-ROOTDIR = $(CURDIR)
-VTA_HW_DIR = $(CURDIR)/../..
-BUILD_DIR = $(VTA_HW_DIR)/build/hardware/xilinx
-SCRIPT_DIR = $(CURDIR)/scripts
-SRC_DIR = $(CURDIR)/src
-
-# Executables
-VIVADO_HLS = vivado_hls
-VIVADO = vivado
-
-# Process VTA JSON config
-VTA_CONFIG := $(VTA_HW_DIR)/config/vta_config.py
-
-# Derive config name
-CONF := $(shell python ${VTA_CONFIG} --cfg-str)
-IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
-HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
-
-# IP file path
-IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip
-
-# Bitstream file path
-BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
-
-.PHONY: all ip bit clean clean_all
-
-all: bit
-ip: $(IP_PATH)
-bit: $(BIT_PATH)
-
-$(IP_PATH): $(SRC_DIR)/*
-	mkdir -p $(IP_BUILD_PATH)
-	cd $(IP_BUILD_PATH) && \
-		$(VIVADO_HLS) \
-		-f $(SCRIPT_DIR)/hls.tcl \
-		-tclargs \
-			$(VTA_HW_DIR) \
-			${VTA_CONFIG}
-
-$(BIT_PATH): $(IP_PATH)
-	mkdir -p $(HW_BUILD_PATH)
-	cd $(HW_BUILD_PATH) && \
-		$(VIVADO) \
-		-mode tcl \
-		-source $(SCRIPT_DIR)/vivado.tcl \
-		-tclargs \
-			$(BUILD_DIR)/hls/$(CONF) \
-			${VTA_CONFIG}
-
-clean:
-	rm -rf *.out *.log
-
-cleanall: clean
-	rm -rf $(BUILD_DIR)
diff --git a/vta/vta-hw/hardware/xilinx/README.md b/vta/vta-hw/hardware/xilinx/README.md
deleted file mode 100644
index 6d289cff0a6e..000000000000
--- a/vta/vta-hw/hardware/xilinx/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Complete instructions on how to build custom FPGA hardware designs are available on the [TVM documentation webpage](https://docs.tvm.ai/vta/install.html#vta-fpga-toolchain-installation).
diff --git a/vta/vta-hw/hardware/xilinx/scripts/hls.tcl b/vta/vta-hw/hardware/xilinx/scripts/hls.tcl
deleted file mode 100644
index 724bdbf2d2ac..000000000000
--- a/vta/vta-hw/hardware/xilinx/scripts/hls.tcl
+++ /dev/null
@@ -1,138 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Command line arguments:
-# Arg 1: path to vta root
-# Arg 2: path of config param script
-
-if { [llength $argv] eq 4 } {
-    set root_dir        [lindex $argv 2]
-    set vta_config      [lindex $argv 3]
-} else {
-    puts "Not enough arguments provided!"
-    exit
-}
-
-# Derive paths
-set src_dir "$root_dir/hardware/xilinx/src"
-set sim_dir "$root_dir/hardware/xilinx/sim"
-set test_dir "$root_dir/tests/hardware/common"
-
-# C define flags that we want to pass to the compiler
-set cflags [exec python $vta_config --cflags]
-
-# Get the VTA configuration paramters
-set ::device        [exec python $vta_config --get-fpga-dev]
-set ::period        [exec python $vta_config --get-fpga-per]
-
-# Get the VTA SRAM reshape/partition factors to get all memories
-# to be of the same axi width.
-set ::inp_reshape_factor    [exec python $vta_config --get-inp-mem-axi-ratio]
-set ::inp_partition_factor  [exec python $vta_config --get-inp-mem-banks]
-set ::wgt_reshape_factor    [exec python $vta_config --get-wgt-mem-axi-ratio]
-set ::wgt_partition_factor  [exec python $vta_config --get-wgt-mem-banks]
-set ::out_reshape_factor    [exec python $vta_config --get-out-mem-axi-ratio]
-set ::out_partition_factor  [exec python $vta_config --get-out-mem-banks]
-
-
-# Initializes the HLS design and sets HLS pragmas for memory partitioning.
-# This is necessary because of a Vivado restriction that doesn't allow for
-# buses wider than 1024 bits.
-proc init_design {} {
-
-    # Set device id
-    set_part $::device
-
-    # Set the clock frequency
-    create_clock -period $::period -name default
-
-    # HLS pragmas to reshape/partition the input memory read/write port
-    set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "load" inp_mem
-    set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "compute" inp_mem
-    if {$::inp_partition_factor > 1} {
-        set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "load" inp_mem
-        set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "compute" inp_mem
-    }
-    # HLS pragmas to reshape/partition the weight memory read/write port
-    set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "load" wgt_mem
-    set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "compute" wgt_mem
-    if {$::wgt_partition_factor >1} {
-        set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "load" wgt_mem
-        set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "compute" wgt_mem
-    }
-    # HLS pragmas to reshape/partition the output memory read/write port
-    set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "compute" out_mem
-    set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "store" out_mem
-    if {$::out_partition_factor > 1} {
-        set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "compute" out_mem
-        set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "store" out_mem
-    }
-}
-
-# HLS behavioral sim
-open_project vta_sim
-set_top vta
-add_files $src_dir/vta.cc -cflags $cflags
-add_files -tb $sim_dir/vta_test.cc -cflags $cflags
-add_files -tb $test_dir/test_lib.cc -cflags $cflags
-open_solution "soln"
-init_design
-csim_design -clean
-close_project
-
-# Generate fetch stage
-open_project vta_fetch
-set_top fetch
-add_files $src_dir/vta.cc -cflags $cflags
-open_solution "soln"
-init_design
-csynth_design
-export_design -format ip_catalog
-close_project
-
-# Generate load stage
-open_project vta_load
-set_top load
-add_files $src_dir/vta.cc -cflags $cflags
-open_solution "soln"
-init_design
-csynth_design
-export_design -format ip_catalog
-close_project
-
-# Generate compute stage
-open_project vta_compute
-set_top compute
-add_files $src_dir/vta.cc -cflags $cflags
-open_solution "soln"
-init_design
-csynth_design
-export_design -format ip_catalog
-close_project
-
-# Generate store stage
-open_project vta_store
-set_top store
-add_files $src_dir/vta.cc -cflags $cflags
-open_solution "soln"
-init_design
-csynth_design
-export_design -format ip_catalog
-close_project
-
-exit
-
diff --git a/vta/vta-hw/hardware/xilinx/scripts/hsi.tcl b/vta/vta-hw/hardware/xilinx/scripts/hsi.tcl
deleted file mode 100644
index 712a7586aca2..000000000000
--- a/vta/vta-hw/hardware/xilinx/scripts/hsi.tcl
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#  file: hsi.tcl
-#  brief: Driver generation script for ARMv7 driver libraries.
-#
-
-open_hw_design export/vta.hdf
-create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
-generate_bsp -dir bsp
-
-exit
diff --git a/vta/vta-hw/hardware/xilinx/scripts/vivado.tcl b/vta/vta-hw/hardware/xilinx/scripts/vivado.tcl
deleted file mode 100644
index 1f8f1dac2f0b..000000000000
--- a/vta/vta-hw/hardware/xilinx/scripts/vivado.tcl
+++ /dev/null
@@ -1,437 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Check if script is running in correct Vivado version.
-set scripts_vivado_version 2018.3
-set current_vivado_version [version -short]
-
-if { [string first $scripts_vivado_version $current_vivado_version] == -1 } {
-   puts ""
-   catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \
-    <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado."}
-   return 1
-}
-
-# Parse argument list, derive the clock to utilize
-if { [llength $argv] eq 2 } {
-  set ip_path     [lindex $argv 0]
-  set vta_config  [lindex $argv 1]
-} else {
-  puts "Arg list incomplete: <path to ip dir> <path to vta_config.py>"
-  return 1
-}
-
-# Get the VTA configuration paramters
-set target            [exec python $vta_config --target]
-set device_family     [exec python $vta_config --get-fpga-family]
-set clock_freq        [exec python $vta_config --get-fpga-freq]
-
-# SRAM dimensions
-set inp_part          [exec python $vta_config --get-inp-mem-banks]
-set inp_mem_width     [exec python $vta_config --get-inp-mem-width]
-set inp_mem_depth     [exec python $vta_config --get-inp-mem-depth]
-set wgt_part          [exec python $vta_config --get-wgt-mem-banks]
-set wgt_mem_width     [exec python $vta_config --get-wgt-mem-width]
-set wgt_mem_depth     [exec python $vta_config --get-wgt-mem-depth]
-set out_part          [exec python $vta_config --get-out-mem-banks]
-set out_mem_width     [exec python $vta_config --get-out-mem-width]
-set out_mem_depth     [exec python $vta_config --get-out-mem-depth]
-
-# AXI bus signals
-set axi_cache         [exec python $vta_config --get-axi-cache-bits]
-set axi_prot          [exec python $vta_config --get-axi-prot-bits]
-
-# Address map
-set ip_reg_map_range  [exec python $vta_config --get-ip-reg-map-range]
-set fetch_base_addr   [exec python $vta_config --get-fetch-base-addr]
-set load_base_addr    [exec python $vta_config --get-load-base-addr]
-set compute_base_addr [exec python $vta_config --get-compute-base-addr]
-set store_base_addr   [exec python $vta_config --get-store-base-addr]
-
-# Paths to IP library of VTA modules
-set proj_name vta
-set design_name $proj_name
-set proj_path "."
-set ip_lib "ip_lib"
-set fetch_ip "${ip_path}/vta_fetch/soln/impl/ip/xilinx_com_hls_fetch_1_0.zip"
-set load_ip "${ip_path}/vta_load/soln/impl/ip/xilinx_com_hls_load_1_0.zip"
-set compute_ip "${ip_path}/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip"
-set store_ip "${ip_path}/vta_store/soln/impl/ip/xilinx_com_hls_store_1_0.zip"
-
-# Create custom project
-set device [exec python $vta_config --get-fpga-dev]
-create_project -force $proj_name $proj_path -part $device
-
-# Update IP repository with generated IP
-file mkdir $ip_lib
-set_property ip_repo_paths $ip_lib [current_project]
-update_ip_catalog
-update_ip_catalog -add_ip $fetch_ip -repo_path $ip_lib
-update_ip_catalog -add_ip $load_ip -repo_path $ip_lib
-update_ip_catalog -add_ip $compute_ip -repo_path $ip_lib
-update_ip_catalog -add_ip $store_ip -repo_path $ip_lib
-
-
-##################################################################
-# CONFIGURE BLOCK DIAGRAM DESIGN
-##################################################################
-
-# Create bd design
-create_bd_design $design_name
-current_bd_design $design_name
-
-# Procedure to initialize FIFO
-proc init_fifo_property {fifo width_bytes depth} {
-  set_property -dict [ list \
-    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
-    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
-    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
-    CONFIG.Full_Flags_Reset_Value {1} \
-    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
-    CONFIG.Input_Depth_axis $depth \
-    CONFIG.Reset_Type {Asynchronous_Reset} \
-    CONFIG.TDATA_NUM_BYTES $width_bytes \
-  ] $fifo
-}
-
-# Procedure to initialize BRAM
-proc init_bram_property {bram width depth} {
-  set_property -dict [ list \
-    CONFIG.Assume_Synchronous_Clk {true} \
-    CONFIG.Byte_Size {8} \
-    CONFIG.Enable_32bit_Address {true} \
-    CONFIG.Enable_B {Use_ENB_Pin} \
-    CONFIG.Memory_Type {True_Dual_Port_RAM} \
-    CONFIG.Read_Width_A $width \
-    CONFIG.Read_Width_B $width \
-    CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
-    CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
-    CONFIG.Use_Byte_Write_Enable {true} \
-    CONFIG.Use_RSTA_Pin {true} \
-    CONFIG.Use_RSTB_Pin {true} \
-    CONFIG.Write_Depth_A $depth \
-    CONFIG.Write_Width_A $width \
-    CONFIG.Write_Width_B $width \
-  ] $bram
-}
-
-# Create instance: proc_sys_reset, and set properties
-set proc_sys_reset \
-  [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ]
-
-# Create instance: pll_clk, and set properties
-set pll_clk [ create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 pll_clk ]
-set_property -dict [ list \
-  CONFIG.CLKOUT1_REQUESTED_OUT_FREQ $clock_freq \
-  CONFIG.RESET_PORT {resetn} \
-  CONFIG.RESET_TYPE {ACTIVE_LOW} \
-  CONFIG.USE_LOCKED {false} \
-] $pll_clk
-
-# Create instance: axi_smc0, and set properties
-set axi_smc0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc0 ]
-set_property -dict [ list \
-  CONFIG.NUM_MI {1} \
-  CONFIG.NUM_SI {5} \
-] $axi_smc0
-
-# Create instance: axi_xbar, and set properties
-set axi_xbar \
-  [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_xbar ]
-set_property -dict [ list \
-  CONFIG.NUM_MI {4} \
-  CONFIG.NUM_SI {1} \
-] $axi_xbar
-
-# Create instance: fetch_0, and set properties
-set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ]
-set_property -dict [ list \
-  CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE $axi_cache \
-  CONFIG.C_M_AXI_INS_PORT_PROT_VALUE $axi_prot \
-] $fetch_0
-
-# Create instance: load_0, and set properties
-set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ]
-set_property -dict [ list \
-  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
-  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
-] $load_0
-
-# Create instance: compute_0, and set properties
-set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ]
-set_property -dict [ list \
-  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
-  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
-  CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE $axi_cache \
-  CONFIG.C_M_AXI_UOP_PORT_PROT_VALUE $axi_prot \
-] $compute_0
-
-# Create instance: store_0, and set properties
-set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ]
-set_property -dict [ list \
-  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
-  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
-] $store_0
-
-# Create command queues and set properties
-set cmd_queue_list {load_queue gemm_queue store_queue}
-foreach cmd_queue $cmd_queue_list {
-  set tmp_cmd_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $cmd_queue ]
-  # Width is 16B (128b, as set in hw_spec.h), depth is 512 (depth of FIFO on Zynq 7000 and Zynq Ultrascale+)
-  # TODO: derive it from vta_config.h
-  [ init_fifo_property $tmp_cmd_queue 16 512 ]
-}
-
-# Create dependence queues and set properties
-set dep_queue_list {l2g_queue g2l_queue g2s_queue s2g_queue}
-foreach dep_queue $dep_queue_list {
-  set tmp_dep_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $dep_queue ]
-  # Width is 1B (min width), depth is 1024
-  # TODO: derive it from vta_config.h
-  [ init_fifo_property $tmp_dep_queue 1 1024 ]
-}
-
-# Create and connect inp_mem partitions
-for {set i 0} {$i < $inp_part} {incr i} {
-  # Create instance: inp_mem, and set properties
-  set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ]
-  [ init_bram_property $inp_mem $inp_mem_width $inp_mem_depth ]
-  # If module has more than 1 mem port, the naming convention changes
-  if {$inp_part > 1} {
-    set porta [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA]
-    set portb [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA]
-  } else {
-    set porta [get_bd_intf_pins load_0/inp_mem_V_PORTA]
-    set portb [get_bd_intf_pins compute_0/inp_mem_V_PORTA]
-  }
-  # Create interface connections
-  connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \
-    [get_bd_intf_pins $inp_mem/BRAM_PORTA] \
-    $porta
-  connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \
-    [get_bd_intf_pins $inp_mem/BRAM_PORTB] \
-    $portb
-}
-
-# Create and connect wgt_mem partitions
-for {set i 0} {$i < $wgt_part} {incr i} {
-  # Create instance: wgt_mem, and set properties
-  set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ]
-  [ init_bram_property $wgt_mem $wgt_mem_width $wgt_mem_depth ]
-  # If module has more than 1 mem port, the naming convention changes
-  if {$wgt_part > 1} {
-    set porta [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA]
-    set portb [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA]
-  } else {
-    set porta [get_bd_intf_pins load_0/wgt_mem_V_PORTA]
-    set portb [get_bd_intf_pins compute_0/wgt_mem_V_PORTA]
-  }
-  # Create interface connections
-  connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \
-    [get_bd_intf_pins $wgt_mem/BRAM_PORTA] \
-    $porta
-  connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \
-    [get_bd_intf_pins $wgt_mem/BRAM_PORTB] \
-    $portb
-}
-
-# Create and connect out_mem partitions
-for {set i 0} {$i < $out_part} {incr i} {
-  # Create instance: out_mem, and set properties
-  set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ]
-  [ init_bram_property $out_mem $out_mem_width $out_mem_depth ]
-  # If module has more than 1 mem port, the naming convention changes
-  if {$out_part > 1} {
-    set porta [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA]
-    set portb [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA]
-  } else {
-    set porta [get_bd_intf_pins compute_0/out_mem_V_PORTA]
-    set portb [get_bd_intf_pins store_0/out_mem_V_PORTA]
-  }
-  # Create interface connections
-  connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \
-    [get_bd_intf_pins $out_mem/BRAM_PORTA] \
-    $porta
-  connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \
-    [get_bd_intf_pins $out_mem/BRAM_PORTB] \
-    $portb
-}
-
-# Create instance: processing_system, and set properties
-if { $device_family eq "zynq-7000" } {
-  set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system ]
-  set_property -dict [ list \
-    CONFIG.PCW_EN_CLK0_PORT {1} \
-    CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \
-    CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \
-    CONFIG.PCW_USE_S_AXI_ACP {1} \
-    CONFIG.preset {ZC702} \
-  ] $processing_system
-  # Get ports that are specific to the Zynq 7000 processing system
-  set ps_clk    [get_bd_pins processing_system/FCLK_CLK0]
-  set ps_rstn   [get_bd_pins processing_system/FCLK_RESET0_N]
-  set maxi_clk  [get_bd_pins processing_system/M_AXI_GP0_ACLK]
-  set saxi_clk  [get_bd_pins processing_system/S_AXI_ACP_ACLK]
-  set maxi      [get_bd_intf_pins processing_system/M_AXI_GP0]
-  set saxi      [get_bd_intf_pins processing_system/S_AXI_ACP]
-} elseif { $device_family eq "zynq-ultrascale+" } {
-  set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.2 processing_system ]
-  set_property -dict [ list \
-    CONFIG.PSU__FPGA_PL0_ENABLE {1} \
-    CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {100} \
-    CONFIG.PSU__USE__M_AXI_GP0 {1} \
-    CONFIG.PSU__USE__M_AXI_GP2 {0} \
-    CONFIG.PSU__USE__S_AXI_GP0 {1}
-  ] $processing_system
-  # Get ports that are specific to the Zynq Ultrascale MPSoC processing system
-  set ps_clk    [get_bd_pins processing_system/pl_clk0]
-  set ps_rstn   [get_bd_pins processing_system/pl_resetn0]
-  set maxi_clk  [get_bd_pins processing_system/maxihpm0_fpd_aclk]
-  set saxi_clk  [get_bd_pins processing_system/saxihpc0_fpd_aclk]
-  set maxi      [get_bd_intf_pins processing_system/M_AXI_HPM0_FPD]
-  set saxi      [get_bd_intf_pins processing_system/S_AXI_HPC0_FPD]
-}
-
-# Create interface connections
-connect_bd_intf_net -intf_net axi_xbar_M00_AXI [get_bd_intf_pins axi_xbar/M00_AXI] [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS]
-connect_bd_intf_net -intf_net axi_xbar_M01_AXI [get_bd_intf_pins axi_xbar/M01_AXI] [get_bd_intf_pins load_0/s_axi_CONTROL_BUS]
-connect_bd_intf_net -intf_net axi_xbar_M02_AXI [get_bd_intf_pins axi_xbar/M02_AXI] [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS]
-connect_bd_intf_net -intf_net axi_xbar_M03_AXI [get_bd_intf_pins axi_xbar/M03_AXI] [get_bd_intf_pins store_0/s_axi_CONTROL_BUS]
-connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V [get_bd_intf_pins l2g_queue/S_AXIS] [get_bd_intf_pins load_0/l2g_dep_queue_V]
-connect_bd_intf_net -intf_net fetch_0_load_queue_V_V [get_bd_intf_pins fetch_0/load_queue_V_V] [get_bd_intf_pins load_queue/S_AXIS]
-connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V [get_bd_intf_pins fetch_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/S_AXIS]
-connect_bd_intf_net -intf_net fetch_0_store_queue_V_V [get_bd_intf_pins fetch_0/store_queue_V_V] [get_bd_intf_pins store_queue/S_AXIS]
-connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V [get_bd_intf_pins compute_0/g2l_dep_queue_V] [get_bd_intf_pins g2l_queue/S_AXIS]
-connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V [get_bd_intf_pins compute_0/g2s_dep_queue_V] [get_bd_intf_pins g2s_queue/S_AXIS]
-connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V [get_bd_intf_pins s2g_queue/S_AXIS] [get_bd_intf_pins store_0/s2g_dep_queue_V]
-connect_bd_intf_net -intf_net load_queue_M_AXIS [get_bd_intf_pins load_0/load_queue_V_V] [get_bd_intf_pins load_queue/M_AXIS]
-connect_bd_intf_net -intf_net gemm_queue_M_AXIS [get_bd_intf_pins compute_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/M_AXIS]
-connect_bd_intf_net -intf_net store_queue_M_AXIS [get_bd_intf_pins store_0/store_queue_V_V] [get_bd_intf_pins store_queue/M_AXIS]
-connect_bd_intf_net -intf_net l2g_queue_M_AXIS [get_bd_intf_pins compute_0/l2g_dep_queue_V] [get_bd_intf_pins l2g_queue/M_AXIS]
-connect_bd_intf_net -intf_net g2l_queue_M_AXIS [get_bd_intf_pins g2l_queue/M_AXIS] [get_bd_intf_pins load_0/g2l_dep_queue_V]
-connect_bd_intf_net -intf_net g2s_queue_M_AXIS [get_bd_intf_pins g2s_queue/M_AXIS] [get_bd_intf_pins store_0/g2s_dep_queue_V]
-connect_bd_intf_net -intf_net s2g_queue_M_AXIS [get_bd_intf_pins compute_0/s2g_dep_queue_V] [get_bd_intf_pins s2g_queue/M_AXIS]
-connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port [get_bd_intf_pins axi_smc0/S00_AXI] [get_bd_intf_pins fetch_0/m_axi_ins_port]
-connect_bd_intf_net -intf_net load_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S01_AXI] [get_bd_intf_pins load_0/m_axi_data_port]
-connect_bd_intf_net -intf_net compute_0_m_axi_uop_port [get_bd_intf_pins axi_smc0/S02_AXI] [get_bd_intf_pins compute_0/m_axi_uop_port]
-connect_bd_intf_net -intf_net compute_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S03_AXI] [get_bd_intf_pins compute_0/m_axi_data_port]
-connect_bd_intf_net -intf_net store_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S04_AXI] [get_bd_intf_pins store_0/m_axi_data_port]
-connect_bd_intf_net -intf_net axi_smc0_M00_AXI [get_bd_intf_pins axi_smc0/M00_AXI] $saxi
-connect_bd_intf_net -intf_net processing_system_m_axi [get_bd_intf_pins axi_xbar/S00_AXI] $maxi
-
-# Create port connections
-connect_bd_net -net processing_system_reset \
-  [get_bd_pins pll_clk/resetn] \
-  [get_bd_pins proc_sys_reset/ext_reset_in] \
-  $ps_rstn
-connect_bd_net -net ps_clk_net \
-  [get_bd_pins pll_clk/clk_in1] \
-  $ps_clk
-connect_bd_net -net proc_sys_reset_interconnect_aresetn \
-  [get_bd_pins axi_xbar/ARESETN] \
-  [get_bd_pins proc_sys_reset/interconnect_aresetn]
-connect_bd_net -net proc_sys_reset_peripheral_aresetn \
-  [get_bd_pins proc_sys_reset/peripheral_aresetn] \
-  [get_bd_pins axi_smc0/aresetn] \
-  [get_bd_pins axi_xbar/M00_ARESETN] \
-  [get_bd_pins axi_xbar/M01_ARESETN] \
-  [get_bd_pins axi_xbar/M02_ARESETN] \
-  [get_bd_pins axi_xbar/M03_ARESETN] \
-  [get_bd_pins axi_xbar/S00_ARESETN] \
-  [get_bd_pins fetch_0/ap_rst_n] \
-  [get_bd_pins load_0/ap_rst_n] \
-  [get_bd_pins store_0/ap_rst_n] \
-  [get_bd_pins compute_0/ap_rst_n] \
-  [get_bd_pins load_queue/s_aresetn] \
-  [get_bd_pins gemm_queue/s_aresetn] \
-  [get_bd_pins store_queue/s_aresetn] \
-  [get_bd_pins l2g_queue/s_aresetn] \
-  [get_bd_pins g2l_queue/s_aresetn] \
-  [get_bd_pins g2s_queue/s_aresetn] \
-  [get_bd_pins s2g_queue/s_aresetn]
-connect_bd_net -net processing_system_clk \
-  [get_bd_pins pll_clk/clk_out1] \
-  [get_bd_pins proc_sys_reset/slowest_sync_clk] \
-  [get_bd_pins axi_smc0/aclk] \
-  [get_bd_pins axi_xbar/ACLK] \
-  [get_bd_pins axi_xbar/M00_ACLK] \
-  [get_bd_pins axi_xbar/M01_ACLK] \
-  [get_bd_pins axi_xbar/M02_ACLK] \
-  [get_bd_pins axi_xbar/M03_ACLK] \
-  [get_bd_pins axi_xbar/S00_ACLK] \
-  [get_bd_pins fetch_0/ap_clk] \
-  [get_bd_pins load_0/ap_clk] \
-  [get_bd_pins compute_0/ap_clk] \
-  [get_bd_pins store_0/ap_clk] \
-  [get_bd_pins load_queue/s_aclk] \
-  [get_bd_pins gemm_queue/s_aclk] \
-  [get_bd_pins store_queue/s_aclk] \
-  [get_bd_pins l2g_queue/s_aclk] \
-  [get_bd_pins g2l_queue/s_aclk] \
-  [get_bd_pins g2s_queue/s_aclk] \
-  [get_bd_pins s2g_queue/s_aclk] \
-  $maxi_clk \
-  $saxi_clk
-
-# Create address segments
-create_bd_addr_seg -range $ip_reg_map_range -offset $fetch_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg
-create_bd_addr_seg -range $ip_reg_map_range -offset $load_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg
-create_bd_addr_seg -range $ip_reg_map_range -offset $compute_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg
-create_bd_addr_seg -range $ip_reg_map_range -offset $store_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg
-if { $device_family eq "zynq-7000" } {
-  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
-  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
-  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
-  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
-  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
-} elseif { $device_family eq "zynq-ultrascale+"} {
-  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
-  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
-  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
-  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
-  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
-}
-
-save_bd_design
-
-
-##################################################################
-# COMPILATION FLOW
-##################################################################
-
-# Create top-level wrapper file
-make_wrapper -files \
-  [get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
-add_files -norecurse $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/hdl/${proj_name}_wrapper.v
-update_compile_order -fileset sources_1
-update_compile_order -fileset sim_1
-
-# Run bistream generation on 8 threads with performance oriented P&R strategy
-set num_threads 8
-launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
-wait_on_run impl_1
-
-# Export hardware description file and bitstream files to export/ dir
-if {[file exist $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit]} {
-  file mkdir $proj_path/export
-  file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.sysdef \
-    $proj_path/export/vta.hdf
-  file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit \
-    $proj_path/export/vta.bit
-}
-
-exit
diff --git a/vta/vta-hw/hardware/xilinx/sim/vta_test.cc b/vta/vta-hw/hardware/xilinx/sim/vta_test.cc
deleted file mode 100644
index 90cef3a3c125..000000000000
--- a/vta/vta-hw/hardware/xilinx/sim/vta_test.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file vta_test.cpp
- * \brief Simulation tests for the VTA design.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-
-#include "../src/vta.h"
-#include "../../../tests/hardware/common/test_lib.h"
-
-int main(void) {
-#if DEBUG == 1
-    printParameters();
-#endif
-
-    int status = 0;
-
-    // Run ALU test (vector-scalar operators)
-    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
-
-    // Run ALU test (vector-vector operators)
-    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
-    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
-    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
-
-    // Run blocked GEMM test
-    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
-
-    // Simple GEMM unit test
-    status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, false);
-
-    return status;
-}
diff --git a/vta/vta-hw/hardware/xilinx/src/vta.cc b/vta/vta-hw/hardware/xilinx/src/vta.cc
deleted file mode 100644
index d9fe6f111303..000000000000
--- a/vta/vta-hw/hardware/xilinx/src/vta.cc
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file vta.cpp
- * \brief VTA HLS design.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "vta.h"
-
-template <typename DATA_T, int MAT_AXI_RATIO>
-void reset_mem(
-  memop_sram_T &sram_idx,
-  memop_sram_T range,
-  DATA_T mem[][MAT_AXI_RATIO]) {
-
-  for (int i = 0; i < range; i ++) {
-    for (int j = 0; j < MAT_AXI_RATIO; j ++) {
-#pragma HLS UNROLL
-      mem[sram_idx][j] = 0;
-    }
-    sram_idx ++;
-  }
-}
-
-template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
-void load_pad_2d(
-  volatile DATA_T *src,
-  DATA_T dst[][MAT_AXI_RATIO],
-  memop_sram_T sram_idx,
-  memop_dram_T dram_idx,
-  memop_size_T y_size,
-  memop_size_T x_size,
-  memop_stride_T x_stride,
-  memop_pad_T x_pad_0,
-  memop_pad_T x_pad_1,
-  memop_sram_T y_offset_0,
-  memop_sram_T y_offset_1) {
-#pragma HLS INLINE
-
-  reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_0, dst);
-  for (int y = 0; y < y_size; y++) {
-#pragma HLS PIPELINE
-    reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_0, dst);
-    memcpy(&dst[sram_idx][0],
-           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
-           x_size * ELEM_BYTES);
-    sram_idx += x_size;
-    dram_idx += x_stride;
-    reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_1, dst);
-  }
-  reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_1, dst);
-}
-
-template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
-void load_2d(
-  volatile DATA_T *src,
-  DATA_T dst[][MAT_AXI_RATIO],
-  memop_sram_T sram_idx,
-  memop_dram_T dram_idx,
-  memop_size_T y_size,
-  memop_size_T x_size,
-  memop_stride_T x_stride) {
-#pragma HLS INLINE
-
-  for (int y = 0; y < y_size; y++) {
-    memcpy(&dst[sram_idx][0],
-           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
-           x_size * ELEM_BYTES);
-#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
-    sram_idx += x_size;
-    dram_idx += x_stride;
-  }
-}
-
-template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
-void read_tensor(
-  IDX_T idx,
-  WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W],
-  NARROW_T dst[Y_DIM][X_DIM]) {
-#pragma HLS INLINE
-
-  // Read in weight tensor
-  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
-    WIDE_T packet = src[idx][p];
-    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
-      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
-      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
-      dst[x][y] = (NARROW_T) packet.range((w + 1) * NARROW_W - 1, w * NARROW_W);
-    }
-  }
-}
-
-template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
-void write_tensor(
-  IDX_T idx,
-  NARROW_T src[Y_DIM][X_DIM],
-  WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
-#pragma HLS INLINE
-
-  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
-    WIDE_T packet = 0;
-    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
-      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
-      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
-      packet.range((w + 1) * NARROW_W - 1, w * NARROW_W) = src[x][y];
-    }
-    dst[idx][p] = packet;
-  }
-}
-
-void fetch(
-  uint32_t insn_count,
-  volatile insn_T *insns,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<insn_T> &store_queue) {
-PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET)
-#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
-#pragma HLS INTERFACE axis port = load_queue
-#pragma HLS INTERFACE axis port = gemm_queue
-#pragma HLS INTERFACE axis port = store_queue
-#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-
-  INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
-#pragma HLS PIPELINE
-    // Read instruction fields
-    insn_T raw_insn = insns[pc];
-    VTAInsn insn;
-    insn.generic = *((VTAGenericInsn *) &raw_insn);
-    // Do some partial decoding
-    opcode_T opcode = insn.generic.opcode;
-    memop_id_T memory_type = insn.mem.memory_type;
-    // Push to appropriate instruction queue
-    if (opcode == VTA_OPCODE_STORE) {
-      store_queue.write(raw_insn);
-    } else if (opcode == VTA_OPCODE_LOAD) {
-      if (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT) {
-        load_queue.write(raw_insn);
-      } else {
-        gemm_queue.write(raw_insn);
-      }
-    } else {
-      gemm_queue.write(raw_insn);
-    }
-  }
-}
-
-void load(
-  volatile bus_T *inputs,
-  volatile bus_T *weights,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) {
-#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
-#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
-#pragma HLS INTERFACE axis port = load_queue
-#pragma HLS INTERFACE axis port = g2l_dep_queue
-#pragma HLS INTERFACE axis port = l2g_dep_queue
-#pragma HLS INTERFACE bram port = wgt_mem
-#pragma HLS INTERFACE bram port = inp_mem
-#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
-#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
-
-  // Pop load instruction
-  insn_T raw_insn = load_queue.read();
-  // Cast to MemInsn
-  insn_T raw_copy = raw_insn;
-  VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);
-
-  // Pop dependence token if instructed
-  if (insn.pop_next_dep) {
-    g2l_dep_queue.read();
-  }
-
-  // Pre-processing
-  memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1);
-  memop_sram_T y_offset_0 = x_width * insn.y_pad_0;
-#pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4
-  memop_sram_T y_offset_1 = x_width * insn.y_pad_1;
-#pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4
-
-  if (insn.memory_type == VTA_MEM_ID_INP) {
-    load_pad_2d<bus_T, INP_MAT_AXI_RATIO, VTA_INP_ELEM_BYTES>(
-        inputs,
-        inp_mem,
-        insn.sram_base,
-        insn.dram_base,
-        insn.y_size,
-        insn.x_size,
-        insn.x_stride,
-        insn.x_pad_0,
-        insn.x_pad_1,
-        y_offset_0,
-        y_offset_1);
-  } else if (insn.memory_type == VTA_MEM_ID_WGT) {
-    load_2d<bus_T, WGT_MAT_AXI_RATIO, VTA_WGT_ELEM_BYTES>(
-        weights,
-        wgt_mem,
-        insn.sram_base,
-        insn.dram_base,
-        insn.y_size,
-        insn.x_size,
-        insn.x_stride);
-  }
-
-  // Push dependence token if instructed
-  if (insn.push_next_dep) {
-    l2g_dep_queue.write(1);
-  }
-}
-
-void gemm(
-  insn_T insn_raw,
-  uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
-  bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
-#pragma HLS INLINE
-
-  VTAGemInsn insn = *((VTAGemInsn *) &insn_raw);
-
-  // Loop offset
-  acc_idx_T dst_offset_out = 0;
-  inp_idx_T src_offset_out = 0;
-  wgt_idx_T wgt_offset_out = 0;
-
-  // Outer Loop
-  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
-    acc_idx_T dst_offset_in = dst_offset_out;
-    inp_idx_T src_offset_in = src_offset_out;
-    wgt_idx_T wgt_offset_in = wgt_offset_out;
-
-    // Inner Loop
-    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
-
-      // Iterate over micro op
-      READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
-#pragma HLS PIPELINE II = 1
-        // Read micro-op fields
-        uop_T uop = uop_mem[upc];
-
-        // Decode indices
-        acc_idx_T dst_idx =
-            uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in;
-        inp_idx_T src_idx =
-            uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in;
-        wgt_idx_T wgt_idx =
-            uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in;
-
-        // Read in weight tensor
-        wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN];
-        read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor);
-        // Read in input tensor
-        inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN];
-        read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor);
-        // Read in accum tensor
-        acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
-        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor);
-        // Output tensor
-        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
-
-        // Inner GEMM loop
-        for (int b = 0; b < VTA_BATCH; b++) {
-          for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
-            // Initialize the accumulator values
-            acc_T accum = a_tensor[b][oc];
-            // Dot product sum
-            sum_T tmp = 0;
-            // Inner matrix multiplication loop (input channel/feature)
-            for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
-              wgt_T w_elem = w_tensor[oc][ic];
-              inp_T i_elem = i_tensor[b][ic];
-              mul_T prod_dsp = i_elem * w_elem;
-              tmp += (sum_T) prod_dsp;
-            }
-            // Update summation
-            accum += (acc_T) tmp;
-            // Write back result acc_mem
-            a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum;
-            // And output vector
-            o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
-          }
-        }
-
-        // Write the results back into accumulator
-        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem);
-        // Write the results back in the output buffer
-        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
-      }
-      // Update offsets
-      dst_offset_in += insn.dst_factor_in;
-      src_offset_in += insn.src_factor_in;
-      wgt_offset_in += insn.wgt_factor_in;
-    }
-    // Update offsets
-    dst_offset_out += insn.dst_factor_out;
-    src_offset_out += insn.src_factor_out;
-    wgt_offset_out += insn.wgt_factor_out;
-  }
-}
-
-void alu(
-  insn_T insn_raw,
-  uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
-  bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
-#pragma HLS INLINE
-
-  VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);
-
-  // Loop offset
-  acc_idx_T dst_offset_out = 0;
-  inp_idx_T src_offset_out = 0;
-
-  // Outer Loop
-  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
-    acc_idx_T dst_offset_in = dst_offset_out;
-    inp_idx_T src_offset_in = src_offset_out;
-
-    // Inner Loop
-    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
-      // Iterate over micro op
-      READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
-#pragma HLS PIPELINE II = 2
-        // Read micro-op fields
-        uop_T uop = uop_mem[upc];
-
-        // Decode
-        acc_idx_T dst_idx =
-            uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in;
-        acc_idx_T src_idx =
-            uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
-
-        // Read in src tensor
-        acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
-        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
-        // Read in dst tensor
-        acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
-        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
-        // Output tensor
-        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
-
-        // Perform ALU op over matrix elements
-        for (int i = 0; i < VTA_BATCH; i++) {
-          for (int b = 0; b < VTA_BLOCK_OUT; b++) {
-            // Read in operands
-            acc_T src_0 = dst_tensor[i][b];
-            acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
-            aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
-            aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
-            if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
-              // Compute Min/Max
-              acc_T mix_val = src_0 < src_1 ?
-                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
-                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
-              dst_tensor[i][b] = mix_val;
-              o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
-            } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
-              // Compute Sum
-              acc_T add_val =
-                  src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
-              dst_tensor[i][b] = add_val;
-              o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
-            } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
-              // Compute Shift Right
-              acc_T shr_val = src_0 >> shft_by;
-              dst_tensor[i][b] = shr_val;
-              o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
-            }
-          }
-        }
-
-        // Write the results back into accumulator
-        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
-        // Write the results back in the output buffer
-        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
-      }
-      // Update offsets
-      dst_offset_in += insn.dst_factor_in;
-      src_offset_in += insn.src_factor_in;
-    }
-    // Update offsets
-    dst_offset_out += insn.dst_factor_out;
-    src_offset_out += insn.src_factor_out;
-  }
-}
-
-void compute(
-  volatile uint32_t &done,
-  volatile uop_T *uops,
-  volatile bus_T *biases,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
-PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET)
-#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
-#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
-#pragma HLS INTERFACE axis port = gemm_queue
-#pragma HLS INTERFACE axis port = l2g_dep_queue
-#pragma HLS INTERFACE axis port = s2g_dep_queue
-#pragma HLS INTERFACE axis port = g2l_dep_queue
-#pragma HLS INTERFACE axis port = g2s_dep_queue
-#pragma HLS INTERFACE bram port = inp_mem
-#pragma HLS INTERFACE bram port = wgt_mem
-#pragma HLS INTERFACE bram port = out_mem
-#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
-#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
-#pragma HLS RESOURCE variable = out_mem core = RAM_1P
-
-  // Micro-op storage
-  static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
-
-  // Accumulator storage
-  static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO];
-#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2
-// This is necessary to obtain II=1
-#pragma HLS DEPENDENCE variable = acc_mem inter false
-
-  // Pop GEMM instruction
-  insn_T raw_insn = gemm_queue.read();
-  // Cast to GenericInsn
-  VTAInsn insn;
-  insn_T raw_copy = raw_insn;
-  insn.generic = *((VTAGenericInsn *) &raw_copy);
-
-  // Pop dependence token if instructed
-  if (insn.generic.pop_prev_dep) {
-    l2g_dep_queue.read();
-  }
-  if (insn.generic.pop_next_dep) {
-    s2g_dep_queue.read();
-  }
-
-  // Set done value
-  done = 0;
-  // Perform action based on opcode
-  if (insn.generic.opcode == VTA_OPCODE_FINISH) {
-    // Set done flag if we reach a FINISH instruction
-    done = 1;
-  } else if (insn.generic.opcode == VTA_OPCODE_LOAD) {
-    // Initialize indices
-    memop_sram_T sram_idx = insn.mem.sram_base;
-    memop_dram_T dram_idx = insn.mem.dram_base;
-    if (insn.mem.memory_type == VTA_MEM_ID_UOP) {
-      // Perform data transfer
-      memcpy(&uop_mem[sram_idx],
-             (const uop_T*) &uops[dram_idx],
-             insn.mem.x_size * sizeof(uop_T));
-    } else if (insn.mem.memory_type == VTA_MEM_ID_ACC) {
-      // Perform data transfer from DRAM
-      load_2d<bus_T, ACC_MAT_AXI_RATIO, VTA_ACC_ELEM_BYTES>(
-          biases,
-          acc_mem,
-          sram_idx,
-          dram_idx,
-          insn.mem.y_size,
-          insn.mem.x_size,
-          insn.mem.x_stride);
-    }
-  } else if (insn.generic.opcode == VTA_OPCODE_GEMM) {
-    gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
-  } else if (insn.generic.opcode == VTA_OPCODE_ALU) {
-    alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
-  }
-
-  // Push dependence token if instructed
-  if (insn.generic.push_prev_dep) {
-    g2l_dep_queue.write(1);
-  }
-  if (insn.generic.push_next_dep) {
-    g2s_dep_queue.write(1);
-  }
-}
-
-void store(
-  volatile bus_T *outputs,
-  hls::stream<insn_T> &store_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
-#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
-#pragma HLS INTERFACE axis port = store_queue
-#pragma HLS INTERFACE axis port = g2s_dep_queue
-#pragma HLS INTERFACE axis port = s2g_dep_queue
-#pragma HLS INTERFACE bram port = out_mem
-#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-#pragma HLS RESOURCE variable = out_mem core = RAM_1P
-
-  // Pop store instruction
-  insn_T raw_insn = store_queue.read();
-  // Cast to MemInsn
-  insn_T raw_copy = raw_insn;
-  VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);
-
-  // Pop dependence token if instructed
-  if (insn.pop_prev_dep) {
-    g2s_dep_queue.read();
-  }
-
-  // Initialize indices
-  memop_sram_T sram_idx = insn.sram_base;
-  memop_dram_T dram_idx = insn.dram_base;
-
-  // Copy along y dimension
-  for (int y = 0; y < insn.y_size; y++) {
-#pragma HLS PIPELINE
-    // Perform data transfer
-    memcpy(
-      const_cast<bus_T*>(&outputs[dram_idx * OUT_MAT_AXI_RATIO]),
-      (const bus_T*) &out_mem[sram_idx][0],
-      insn.x_size * VTA_OUT_ELEM_BYTES);
-#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
-    sram_idx += insn.x_size;
-    dram_idx += insn.x_stride;
-  }
-
-  // Push dependence token if instructed
-  if (insn.push_prev_dep) {
-    s2g_dep_queue.write(1);
-  }
-}
-
-void vta(
-  uint32_t insn_count,
-  volatile insn_T *insns,
-  volatile uop_T *uops,
-  volatile bus_T *inputs,
-  volatile bus_T *weights,
-  volatile bus_T *biases,
-  volatile bus_T *outputs) {
-#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
-#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
-#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
-#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
-#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
-#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
-#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
-#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-
-  // Instantiate temporary instruction queues (used for peeking)
-  hls::stream<insn_T> tmp_load_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_load_queue)
-  hls::stream<insn_T> tmp_gemm_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_gemm_queue)
-  hls::stream<insn_T> tmp_store_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_store_queue)
-
-  // Instatiate physical instruction queues
-  hls::stream<insn_T> load_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=load_queue)
-  hls::stream<insn_T> gemm_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=gemm_queue)
-  hls::stream<insn_T> store_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=store_queue)
-
-  // Dependence queues
-  hls::stream<bool> l2g_dep_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=l2g_dep_queue)
-  hls::stream<bool> s2g_dep_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue)
-  hls::stream<bool> g2l_dep_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue)
-  hls::stream<bool> g2s_dep_queue;
-  PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue)
-
-  // Instantiate memories
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO];
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO];
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO];
-
-  // Push all instructions into the queues
-  fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue);
-
-  // Global done indicator
-  uint32_t done = 0;
-
-  // Temporary instructions
-  insn_T tmp_load;
-  insn_T tmp_gemv;
-  insn_T tmp_store;
-
-  // Peeking status
-  bool tmp_load_popped = false;
-  bool tmp_gemm_popped = false;
-  bool tmp_store_popped = false;
-  int exit_counter = 0;
-
-  // Main control loop
-  while (true) {
-    // First execute as many load instructions as possible
-    while (!tmp_load_queue.empty() || tmp_load_popped == true) {
-      // Pop the load instruction
-      if (!tmp_load_popped) {
-        tmp_load_queue.read(tmp_load);
-        tmp_load_popped = true;
-      }
-      // Check dependences and invoke the load stage
-      VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load);
-      if ((insn.pop_next_dep && !g2l_dep_queue.empty()) ||
-          !insn.pop_next_dep) {
-        // Push the instruction in the load queue
-        load_queue.write(tmp_load);
-        tmp_load_popped = false;
-        load(inputs, weights, load_queue, g2l_dep_queue, l2g_dep_queue, inp_mem, wgt_mem);
-      } else {
-        // Execution of load stage pending on completion of other stages, so break here...
-        break;
-      }
-    }
-    // Next execute as many gemm instructions as possible
-    while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) {
-      // Pop the gemm instruction
-      if (!tmp_gemm_popped) {
-        tmp_gemm_queue.read(tmp_gemv);
-        tmp_gemm_popped = true;
-      }
-      // Check dependences and invoke the load stage
-      VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
-      if (
-        (insn.pop_prev_dep && !l2g_dep_queue.empty() &&
-         insn.pop_next_dep && !s2g_dep_queue.empty()) ||
-        (!insn.pop_prev_dep && insn.pop_next_dep &&
-         !s2g_dep_queue.empty()) ||
-        (insn.pop_prev_dep && !l2g_dep_queue.empty() &&
-        !insn.pop_next_dep) ||
-        (!insn.pop_prev_dep && !insn.pop_next_dep)
-      ) {
-        // Push the instruction in the load queue
-        gemm_queue.write(tmp_gemv);
-        tmp_gemm_popped = false;
-        compute(done, uops, biases, gemm_queue, l2g_dep_queue, s2g_dep_queue,
-                g2l_dep_queue, g2s_dep_queue, inp_mem, wgt_mem, out_mem);
-      } else {
-        // Execution of load stage pending on completion of other stages,
-        // so break here...
-        break;
-      }
-    }
-    // Finally execute as many store instructions as possible
-    while (!tmp_store_queue.empty() || tmp_store_popped == true) {
-      // Pop the load instruction
-      if (!tmp_store_popped) {
-        tmp_store_queue.read(tmp_store);
-        tmp_store_popped = true;
-      }
-      // Check dependences and invoke the load stage
-      VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store);
-
-      if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) ||
-          !insn.pop_prev_dep) {
-        // Push the instruction in the load queue
-        store_queue.write(tmp_store);
-        tmp_store_popped = false;
-        store(outputs, store_queue, g2s_dep_queue, s2g_dep_queue, out_mem);
-      } else {
-        // Execution of load stage pending on completion of other stages, so break here...
-        break;
-      }
-    }
-    // Check if we get a signal that we are done
-    if (done) {
-      break;
-    }
-    exit_counter++;
-    if (exit_counter > 1000) {
-      if (tmp_load_popped) {
-        if (g2l_dep_queue.empty()) {
-          printf("waiting on g2l\n");
-        }
-      }
-      if (tmp_gemm_popped) {
-        VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
-        if (l2g_dep_queue.empty() && insn.pop_prev_dep) {
-          printf("waiting on l2g\n");
-        }
-        if (s2g_dep_queue.empty() && insn.pop_next_dep) {
-          printf("waiting on s2g\n");
-        }
-      }
-      if (tmp_store_popped) {
-        if (g2s_dep_queue.empty()) {
-          printf("waiting on g2s\n");
-        }
-      }
-      break;
-    }
-  }
-
-  // Ensure that the tokens are empty
-  bool tmp_tok;
-  int l2g_count = 0;
-  int s2g_count = 0;
-  int g2l_count = 0;
-  int g2s_count = 0;
-  while (l2g_dep_queue.read_nb(tmp_tok)) {
-    l2g_count++;
-  }
-  while (s2g_dep_queue.read_nb(tmp_tok)) {
-    s2g_count++;
-  }
-  while (g2l_dep_queue.read_nb(tmp_tok)) {
-    g2l_count++;
-  }
-  while (g2s_dep_queue.read_nb(tmp_tok)) {
-    g2s_count++;
-  }
-
-  assert(l2g_count == 0 && g2s_count == 0 && g2l_count == 0 && g2s_count == 0);
-}
diff --git a/vta/vta-hw/hardware/xilinx/src/vta.h b/vta/vta-hw/hardware/xilinx/src/vta.h
deleted file mode 100644
index d4a2a2dd98f8..000000000000
--- a/vta/vta-hw/hardware/xilinx/src/vta.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file vta.h
- * \brief Type definitions and prototype for VTA HLS design.
- */
-#ifndef VTA_VTA_H_
-#define VTA_VTA_H_
-
-#include <ap_axi_sdata.h>
-#include <ap_int.h>
-#include <assert.h>
-#include <hls_stream.h>
-
-#include <vta/hw_spec.h>
-
-/*!
-* Define HLS stream depth
-*/
-#define PRAGMA_SUB(x) _Pragma (#x)
-#define PRAGMA_HLS(x) PRAGMA_SUB(x)
-#define STREAM_IN_DEPTH 8
-
-/* \typedef bus_T memory bus datatype*/
-typedef ap_uint<VTA_BUS_WIDTH> bus_T;
-
-/* \typedef uop_T Micro-op datatype*/
-typedef ap_uint<VTA_UOP_WIDTH> uop_T;
-
-/* \typedef inp_T Input datatype*/
-typedef ap_int<VTA_INP_WIDTH> inp_T;
-
-/* \typedef wgt_T Weight datatype*/
-typedef ap_int<VTA_WGT_WIDTH> wgt_T;
-
-/* \typedef out_T Output datatype*/
-typedef ap_int<VTA_OUT_WIDTH> out_T;
-
-/* \typedef acc_T Accumulator datatype*/
-typedef ap_int<VTA_ACC_WIDTH> acc_T;
-
-/* \typedef mul_T Multiplier output datatype*/
-typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
-
-/* \typedef sum_T GEMM accumulator datatype*/
-typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
-
-/* \typedef uop_idx_T Micro-op SRAM index datatype*/
-typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
-
-/* \typedef inp_idx_T Input SRAM index datatype*/
-typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
-
-/* \typedef wgt_idx_T Weight SRAM index datatype*/
-typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
-
-/* \typedef acc_idx_T Accumulator SRAM index datatype*/
-typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
-
-/* \typedef opcode_T Opcode datatype*/
-typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
-
-/* \typedef insn_T Instruction datatype*/
-typedef ap_uint<VTA_INS_WIDTH> insn_T;
-
-/* \typedef loop_T Loop bound datatype*/
-typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
-
-/* \typedef memop_id_T Memory operation ID datatype*/
-typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
-
-/* \typedef memop_sram_T Memory operation SRAM index datatype*/
-typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
-
-/* \typedef memop_dram_T Memory operation DRAM index datatype*/
-typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
-
-/* \typedef memop_size_T Memory operation range datatype*/
-typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
-
-/* \typedef memop_stride_T Memory operation stride datatype*/
-typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
-
-/* \typedef memop_pad_T Memory operation pad width datatype*/
-typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
-
-/* \typedef aluop_opcode_T ALU operation opcode datatype*/
-typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
-
-/* \typedef aluop_imm_T ALU operation immediate datatype*/
-typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
-
-/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
-typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
-
-/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
-typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
-
-/*!
-* \brief Fetch module.
-*   Reads in \a insn_count instructions via DMA and pushes them to the
-*   appropriate load, gemm or store queue.
-* \param insns Instruction data base address in DRAM. AXI-4 master port.
-* \param insn_count Total instruction count. AXI-lite memory mapped register.
-* \param load_queue Load instruction queue. AXI-stream FIFO.
-* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
-* \param store_queue Store instruction queue. AXI-stream FIFO.
-*/
-void fetch(
-  uint32_t insn_count,
-  volatile insn_T *insns,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<insn_T> &store_queue);
-
-/*!
-* \brief Load module.
-*   Reads in load instructions from the load queue, and performs appropriate
-*   DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
-*   Updates dependence queues accordingly.
-* \param inputs Input data base address in DRAM. AXI-4 master port.
-* \param weights Weight data base address in DRAM. AXI-4 master port.
-* \param load_queue Load instruction queue. AXI-stream FIFO.
-* \param g2l_dep_queue Dependence queue from GEMM to load stage.
-*   AXI-stream FIFO.
-* \param l2g_dep_queue Dependence queue from load to GEMM stage.
-*   AXI-stream FIFO.
-* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
-* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
-*/
-void load(
-  volatile bus_T *inputs,
-  volatile bus_T *weights,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]);
-
-/*!
-* \brief Compute module.
-*   Reads in GEMM instructions from the gemm queue, and performs appropriate
-*   GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
-*   and writes computation results into the \a out_mem. Updates dependence
-*   queues accordingly.
-* \param done Signal that indicates that VLA is done.  AXI-lite memory mapped
-*   register.
-* \param uops Micro-op data base address in DRAM. AXI-4 master port.
-* \param biases Bias data base address in DRAM. AXI-4 master port.
-* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
-* \param l2g_dep_queue Dependence queue from load to gemm stage.
-*   AXI-stream FIFO.
-* \param s2g_dep_queue Dependence queue from store to gemm stage.
-*   AXI-stream FIFO.
-* \param g2l_dep_queue Dependence queue from gemm to load stage.
-*   AXI-stream FIFO.
-* \param g2s_dep_queue Dependence queue from gemm to store stage.
-*   AXI-stream FIFO.
-* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
-* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
-* \param out_mem Local output SRAM buffer. Write only single port BRAM.
-*/
-void compute(
-  volatile uint32_t &done,
-  volatile uop_T *uops,
-  volatile bus_T *biases,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
-  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
-
-/*!
-* \brief Store module.
-*   Reads in store instructions from the store queue, and performs appropriate
-*   store instructions from the output buffer in SRAM to DRAM. Updates dependence
-*   queues accordingly.
-* \param outputs Output data base address in DRAM. AXI-4 master port.
-* \param store_queue Store instruction queue. AXI-stream FIFO.
-* \param g2s_dep_queue Dependence queue from gemm to store stage.
-*   AXI-stream FIFO.
-* \param s2g_dep_queue Dependence queue from store to gemm stage.
-*   AXI-stream FIFO.
-* \param out_mem Local output SRAM buffer. Read only single port BRAM.
-*/
-void store(
-  volatile bus_T *outputs,
-  hls::stream<insn_T> &store_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
-
-/*!
-* \brief VTA wrapper for simulation purpose only.
-*   Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
-* \param insn_count Total instruction count. AXI-lite memory mapped register.
-* \param insns Instruction data base address in DRAM. AXI-4 master port.
-* \param uops Micro-op data base address in DRAM. AXI-4 master port.
-* \param inputs Input data base address in DRAM. AXI-4 master port.
-* \param weights Weight data base address in DRAM. AXI-4 master port.
-* \param biases Bias data base address in DRAM. AXI-4 master port.
-* \param outputs Output data base address in DRAM. AXI-4 master port.
-*/
-void vta(
-  uint32_t insn_count,
-  volatile insn_T *insns,
-  volatile uop_T *uops,
-  volatile bus_T *inputs,
-  volatile bus_T *weights,
-  volatile bus_T *biases,
-  volatile bus_T *outputs);
-
-#endif  // VTA_VTA_H_
diff --git a/vta/vta-hw/include/vta/dpi/module.h b/vta/vta-hw/include/vta/dpi/module.h
deleted file mode 100644
index c1fc6bf43c98..000000000000
--- a/vta/vta-hw/include/vta/dpi/module.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef VTA_DPI_MODULE_H_
-#define VTA_DPI_MODULE_H_
-
-#include <tvm/runtime/module.h>
-#include <mutex>
-#include <queue>
-#include <condition_variable>
-#include <string>
-
-namespace vta {
-namespace dpi {
-
-/*!
- * \brief DPI driver module for managing the accelerator
- */
-class DPIModuleNode : public tvm::runtime::ModuleNode {
- public:
-/*! \brief Launch hardware simulation */
-  virtual void SimLaunch() = 0;
-
-/*! \brief Halt hardware simulation */
-  virtual void SimWait() = 0;
-
-/*! \brief Resume hardware simulation */
-  virtual void SimResume() = 0;
-
-/*! \brief Finish hardware simulation */
-  virtual void SimFinish() = 0;
-
-/*!
- * \brief Write an accelerator register
- * \param addr The register address
- * \param value The register value
- */
-  virtual void WriteReg(int addr, uint32_t value) = 0;
-
-/*!
- * \brief Read an accelerator register
- * \param addr The register address
- */
-  virtual uint32_t ReadReg(int addr) = 0;
-
-  static tvm::runtime::Module Load(std::string dll_name);
-};
-
-}  // namespace dpi
-}  // namespace vta
-#endif  // VTA_DPI_MODULE_H_
diff --git a/vta/vta-hw/include/vta/dpi/tsim.h b/vta/vta-hw/include/vta/dpi/tsim.h
deleted file mode 100644
index 8e13defc06b0..000000000000
--- a/vta/vta-hw/include/vta/dpi/tsim.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef VTA_DPI_TSIM_H_
-#define VTA_DPI_TSIM_H_
-
-#include <tvm/runtime/c_runtime_api.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned char dpi8_t;
-
-typedef unsigned int dpi32_t;
-
-typedef unsigned long long dpi64_t; // NOLINT(*)
-
-/*! \brief the context handle */
-typedef void* VTAContextHandle;
-
-typedef void (*VTASimDPIFunc)(
-    VTAContextHandle self,
-    dpi8_t* wait,
-    dpi8_t* exit);
-
-/*!
- * \brief Host DPI callback function that is invoked in VTAHostDPI.v every clock cycle
- * \param req_valid Host has a valid request for read or write a register in Accel
- * \param req_opcode Host request type, opcode=0 for read and opcode=1 for write
- * \param req_addr Host request register address
- * \param req_value Host request value to be written to a register
- * \param req_deq Accel is ready to dequeue Host request
- * \param resp_valid Accel has a valid response for Host
- * \param resp_value Accel response value for Host
- * \return 0 if success,
- */
-typedef void (*VTAHostDPIFunc)(
-    VTAContextHandle self,
-    dpi8_t* req_valid,
-    dpi8_t* req_opcode,
-    dpi8_t* req_addr,
-    dpi32_t* req_value,
-    dpi8_t req_deq,
-    dpi8_t resp_valid,
-    dpi32_t resp_value);
-
-/*!
- * \brief Memory DPI callback function that is invoked in VTAMemDPI.v every clock cycle
- * \param req_valid Accel has a valid request for Host
- * \param req_opcode Accel request type, opcode=0 (read) and opcode=1 (write)
- * \param req_len Accel request length of size 8-byte and starts at 0
- * \param req_addr Accel request base address
- * \param wr_valid Accel has a valid value for Host
- * \param wr_value Accel has a value to be written Host
- * \param rd_valid Host has a valid value for Accel
- * \param rd_value Host has a value to be read by Accel
- */
-typedef void (*VTAMemDPIFunc)(
-    VTAContextHandle self,
-    dpi8_t req_valid,
-    dpi8_t req_opcode,
-    dpi8_t req_len,
-    dpi64_t req_addr,
-    dpi8_t wr_valid,
-    dpi64_t wr_value,
-    dpi8_t* rd_valid,
-    dpi64_t* rd_value,
-    dpi8_t rd_ready);
-
-/*! \brief The type of VTADPIInit function pointer */
-typedef void (*VTADPIInitFunc)(VTAContextHandle handle,
-                            VTASimDPIFunc sim_dpi,
-                            VTAHostDPIFunc host_dpi,
-                            VTAMemDPIFunc mem_dpi);
-
-
-/*! \brief The type of VTADPISim function pointer */
-typedef int (*VTADPISimFunc)();
-
-/*!
- * \brief Set Host and Memory DPI functions
- * \param handle DPI Context handle
- * \param sim_dpi Sim DPI function
- * \param host_dpi Host DPI function
- * \param mem_dpi Memory DPI function
- */
-TVM_DLL void VTADPIInit(VTAContextHandle handle,
-                VTASimDPIFunc sim_dpi,
-                VTAHostDPIFunc host_dpi,
-                VTAMemDPIFunc mem_dpi);
-
-/*! \brief VTA hardware simulation thread */
-TVM_DLL int VTADPISim();
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_DPI_TSIM_H_
diff --git a/vta/vta-hw/include/vta/driver.h b/vta/vta-hw/include/vta/driver.h
deleted file mode 100644
index 6d39d414de92..000000000000
--- a/vta/vta-hw/include/vta/driver.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file vta/driver.h
- * \brief Driver interface that is used by runtime.
- *
- * Driver's implementation is device specific.
- */
-
-#ifndef VTA_DRIVER_H_
-#define VTA_DRIVER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-/*! \brief Memory management constants for cached memory */
-#define VTA_CACHED 1
-/*! \brief Memory management constants for non-cached memory */
-#define VTA_NOT_CACHED 0
-
-/*! \brief Physically contiguous buffer size limit */
-#ifndef VTA_MAX_XFER
-#define VTA_MAX_XFER (1<<25)
-#endif
-
-/*! PAGE SIZE */
-#define VTA_PAGE_BITS 12
-#define VTA_PAGE_BYTES (1 << VTA_PAGE_BITS)
-
-/*! \brief Device resource context  */
-typedef void * VTADeviceHandle;
-
-/*! \brief physical address */
-#ifdef USE_VTA64
-typedef uint64_t vta_phy_addr_t;
-#else
-typedef uint32_t vta_phy_addr_t;
-#endif
-
-/*!
- * \brief Allocate a device resource handle
- * \return The device handle.
- */
-VTADeviceHandle VTADeviceAlloc();
-
-/*!
- * \brief Free a device handle
- * \param handle The device handle to be freed.
- */
-void VTADeviceFree(VTADeviceHandle handle);
-
-/*!
- * \brief Launch the instructions block until done.
- * \param device The device handle.
- * \param insn_phy_addr The physical address of instruction stream.
- * \param insn_count Instruction count.
- * \param wait_cycles The maximum of cycles to wait
- *
- * \return 0 if running is successful, 1 if timeout.
- */
-int VTADeviceRun(VTADeviceHandle device,
-                 vta_phy_addr_t insn_phy_addr,
-                 uint32_t insn_count,
-                 uint32_t wait_cycles);
-
-/*!
- * \brief Allocates physically contiguous region in memory readable/writeable by FPGA.
- * \param size Size of the region in Bytes.
- * \param cached Region can be set to not cached (write-back) if set to 0.
- * \return A pointer to the allocated region.
- */
-void* VTAMemAlloc(size_t size, int cached);
-
-/*!
- * \brief Frees a physically contiguous region in memory readable/writeable by FPGA.
- * \param buf Buffer to free.
- */
-void VTAMemFree(void* buf);
-
-/*!
- * \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
- * \param buf Pointer to memory region allocated with VTAMemAlloc.
- * \return The physical address of the memory region.
- */
-vta_phy_addr_t VTAMemGetPhyAddr(void* buf);
-
-/*!
- * \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
- * \param dst The desination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
- * \param src The source buffer in host memory.
- * \param size Size of the region in Bytes.
- */
-void VTAMemCopyFromHost(void* dst, const void* src, size_t size);
-
-/*!
- * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
- * \param dst The destination buffer in host memory.
- * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
- * \param size Size of the region in Bytes.
- */
-void VTAMemCopyToHost(void* dst, const void* src, size_t size);
-
-/*!
- * \brief Flushes the region of memory out of the CPU cache to DRAM.
- * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
- *                 This need to be the virtual address.
- * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
- *                 This need to be the physical address.
- * \param size Size of the region to flush in Bytes.
- */
-void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
-
-/*!
- * \brief Invalidates the region of memory that is cached.
- * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
- *                 This need to be the virtual address.
- * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
- *                 This need to be the physical address.
- * \param size Size of the region to invalidate in Bytes.
- */
-void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_DRIVER_H_
diff --git a/vta/vta-hw/include/vta/hw_spec.h b/vta/vta-hw/include/vta/hw_spec.h
deleted file mode 100644
index 2294ae90ffde..000000000000
--- a/vta/vta-hw/include/vta/hw_spec.h
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file hw_spec.h
- * \brief Preprocessor definitions for VTA HLS design and runtime.
- */
-
-#ifndef VTA_HW_SPEC_H_
-#define VTA_HW_SPEC_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-
-/*! Memory bus width */
-#define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH)
-
-/*! log2 of instruction data type width */
-#define VTA_LOG_INS_WIDTH 7
-/*! Instruction data type width */
-#define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
-/*! log2 of micro op data type width */
-#define VTA_LOG_UOP_WIDTH 5
-/*! Micro Op data type width */
-#define VTA_UOP_WIDTH (1 << VTA_LOG_UOP_WIDTH)
-/*! Weight data type width */
-#define VTA_WGT_WIDTH (1 << VTA_LOG_WGT_WIDTH)
-/*! Input data type width */
-#define VTA_INP_WIDTH (1 << VTA_LOG_INP_WIDTH)
-/*! Output data type width */
-#define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
-/*! Accumulator data type width */
-#define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)
-
-/*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
-#define VTA_BATCH (1 << VTA_LOG_BATCH)
-/*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */
-#define VTA_BLOCK_IN (1 << VTA_LOG_BLOCK_IN)
-/*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
-#define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)
-
-/*! On-chip micro-op buffer size in B */
-#define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
-/*! On-chip weight buffer size in B */
-#define VTA_WGT_BUFF_SIZE (1 << VTA_LOG_WGT_BUFF_SIZE)
-/*! On-chip activation buffer size in B */
-#define VTA_INP_BUFF_SIZE (1 << VTA_LOG_INP_BUFF_SIZE)
-/*! On-chip accumulator buffer size in B */
-#define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)
-
-/*! Input vector size in bits */
-#define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN)
-/*! Weight vector size in bits */
-#define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN)
-/*! Accumulator vector size in bits */
-#define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
-/*! Output vector size in bits */
-#define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
-
-/*! Ratio between input matrix size and axi width */
-#define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH)
-/*! Ratio between weight matrix size and axi width */
-#define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH)
-/*! Ratio between accumulator matrix size and axi width */
-#define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH)
-/*! Ratio between output matrix size and axi width */
-#define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH)
-
-/*! Size of instruction buffer element in B */
-#define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
-/*! Size of uop buffer element in B*/
-#define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
-/*! Size of activation buffer element in B*/
-#define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8)
-/*! Size of weight buffer element in B*/
-#define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8)
-/*! Size of accumulator buffer element in B*/
-#define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8)
-/*! Size of output buffer element in B*/
-#define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8)
-
-/*! On-chip micro-op buffer depth */
-#define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
-/*! log2 of on-chip micro-op buffer depth */
-#define VTA_LOG_UOP_BUFF_DEPTH (VTA_LOG_UOP_BUFF_SIZE - VTA_LOG_UOP_WIDTH + 3)
-// ! \brief On-chip weight buffer depth
-#define VTA_WGT_BUFF_DEPTH (VTA_WGT_BUFF_SIZE / VTA_WGT_ELEM_BYTES)
-/*! log2 of weight micro-op buffer depth */
-#define VTA_LOG_WGT_BUFF_DEPTH \
-    (VTA_LOG_WGT_BUFF_SIZE - VTA_LOG_BLOCK_OUT - VTA_LOG_BLOCK_IN - VTA_LOG_WGT_WIDTH + 3)
-/*! On-chip activation buffer depth */
-#define VTA_INP_BUFF_DEPTH (VTA_INP_BUFF_SIZE / VTA_INP_ELEM_BYTES)
-/*! log2 of activation micro-op buffer depth */
-#define VTA_LOG_INP_BUFF_DEPTH \
-    (VTA_LOG_INP_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_IN - VTA_LOG_INP_WIDTH + 3)
-/*! On-chip accumulator buffer depth */
-#define VTA_ACC_BUFF_DEPTH (VTA_ACC_BUFF_SIZE / VTA_ACC_ELEM_BYTES)
-/*! log2 of on-chip accumulator buffer depth */
-#define VTA_LOG_ACC_BUFF_DEPTH \
-    (VTA_LOG_ACC_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_OUT - VTA_LOG_ACC_WIDTH + 3)
-
-/*! Instruction opcode field bitwidth */
-#define VTA_OPCODE_BIT_WIDTH 3
-/*! ALU opcode field bitwidth */
-#define VTA_ALU_OPCODE_BIT_WIDTH 2
-
-/*! Opcode: load encoding */
-#define VTA_OPCODE_LOAD 0
-/*! Opcode: store encoding */
-#define VTA_OPCODE_STORE 1
-/*! Opcode: GEMM encoding */
-#define VTA_OPCODE_GEMM 2
-/*! Opcode: finish encoding */
-#define VTA_OPCODE_FINISH 3
-/*! Opcode: ALU encoding */
-#define VTA_OPCODE_ALU 4
-
-/*! ALU opcode: unary min op */
-#define VTA_ALU_OPCODE_MIN 0
-/*! ALU opcode: unary max op */
-#define VTA_ALU_OPCODE_MAX 1
-/*! ALU opcode: binary add op */
-#define VTA_ALU_OPCODE_ADD 2
-/*! ALU opcode: shift right by immediate op */
-#define VTA_ALU_OPCODE_SHR 3
-
-/*! Memory type field bitwidth */
-#define VTA_MEMOP_ID_BIT_WIDTH 2
-/*! Load/Store Instruction: DRAM address width*/
-#define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
-/*! Load/Store Instruction: DRAM address width*/
-#define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
-/*! Load/Store Instruction: transfer size width*/
-#define VTA_MEMOP_SIZE_BIT_WIDTH 16
-/*! Load/Store Instruction: stride size width*/
-#define VTA_MEMOP_STRIDE_BIT_WIDTH 16
-/*! Load/Store Instruction: padding width*/
-#define VTA_MEMOP_PAD_BIT_WIDTH 4
-/*! Load/Store Instruction: padding value encoding width*/
-#define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
-/*! GEMM/ALU Instruction: loop max iter bits */
-#define VTA_LOOP_ITER_WIDTH 14
-/*! ALU Instruction: immediate bitwidth*/
-#define VTA_ALUOP_IMM_BIT_WIDTH 16
-/*! ALU Instruction: shift arg bitwidth*/
-#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
-/*! ALU Instruction: multiply arg bitwidth*/
-#define VTA_MUL_ARG_BIT_WIDTH 8
-
-/*! Mem ID constant: uop memory */
-#define VTA_MEM_ID_UOP 0
-/*! Mem ID constant: weight memory */
-#define VTA_MEM_ID_WGT 1
-/*! Mem ID constant: input memory */
-#define VTA_MEM_ID_INP 2
-/*! Mem ID constant: accumulator/bias memory */
-#define VTA_MEM_ID_ACC 3
-/*! Mem ID constant: output store buffer */
-#define VTA_MEM_ID_OUT 4
-
-/*! GEMM Micro-op start position of the acc_idx field */
-#define VTA_UOP_GEM_0_0 0
-/*! GEMM Micro-op end position of the acc_idx field */
-#define VTA_UOP_GEM_0_1 (VTA_UOP_GEM_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
-/*! GEMM Micro-op start position of the inp_idx field */
-#define VTA_UOP_GEM_1_0 (VTA_UOP_GEM_0_1 + 1)
-/*! GEMM Micro-op end position of the inp_idx field */
-#define VTA_UOP_GEM_1_1 (VTA_UOP_GEM_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
-/*! GEMM Micro-op start position of the wgt_idx field */
-#define VTA_UOP_GEM_2_0 (VTA_UOP_GEM_1_1 + 1)
-/*! GEMM Micro-op end position of the wgt_idx field */
-#define VTA_UOP_GEM_2_1 (VTA_UOP_GEM_2_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
-
-/*! GEMM Micro-op start position of the acc_idx field */
-#define VTA_UOP_ALU_0_0 0
-/*! GEMM Micro-op end position of the acc_idx field */
-#define VTA_UOP_ALU_0_1 (VTA_UOP_ALU_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
-/*! GEMM Micro-op start position of the inp_idx field */
-#define VTA_UOP_ALU_1_0 (VTA_UOP_ALU_0_1 + 1)
-/*! GEMM Micro-op end position of the inp_idx field */
-#define VTA_UOP_ALU_1_1 (VTA_UOP_ALU_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
-
-/*! \brief VTA generic instruction */
-typedef struct {
-  /*! \brief The instruction opcode */
-  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
-  /*! \brief Unused in this instruction */
-  uint64_t pop_prev_dep   : 1;
-  /*! \brief Pop dependence token from GEMM stage */
-  uint64_t pop_next_dep   : 1;
-  /*! \brief Unused in this instruction */
-  uint64_t push_prev_dep  : 1;
-  /*! \brief Push dependence token to GEMM stage */
-  uint64_t push_next_dep  : 1;
-  /*! \brief Padding */
-  uint64_t pad_0          : 64 - VTA_OPCODE_BIT_WIDTH - 4;
-  /*! \brief Padding */
-  uint64_t pad_1          : 64;
-} VTAGenericInsn;
-
-/*! \brief VTA load/store instruction
-*   Load/store instruction can describe a 2D strided access pattern
-*   with padding, which can be useful to perform spatial padding
-*   on the fly on a tensor on which to perform 2D convolution.
-*   For instance if we try to load a 4x4 spatial tile from a 16x16
-*   matrix with padding of size 1 on all dimensions:
-*   y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
-*   x_pad_0 = 1, x_pad_1 = 1.
-*/
-typedef struct {
-  /*! \brief The instruction opcode */
-  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
-  /*! \brief Unused in this instruction */
-  uint64_t pop_prev_dep   : 1;
-  /*! \brief Pop dependence token from GEMM stage */
-  uint64_t pop_next_dep   : 1;
-  /*! \brief Unused in this instruction */
-  uint64_t push_prev_dep  : 1;
-  /*! \brief Push dependence token to GEMM stage */
-  uint64_t push_next_dep  : 1;
-  /*! \brief Source/destination SRAM for store/load instruction */
-  uint64_t memory_type    : VTA_MEMOP_ID_BIT_WIDTH;
-  /*! \brief SRAM base address (pointer to memory elem type) */
-  uint64_t sram_base      : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
-  /*! \brief DRAM base address (pointer to memory elem type) */
-  uint64_t dram_base      : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
-  /*! \brief 2D access pattern: y-size */
-  uint64_t y_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
-  /*! \brief 2D access pattern: x-size (in terms of memory elements) */
-  uint64_t x_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
-  /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
-  uint64_t x_stride       : VTA_MEMOP_STRIDE_BIT_WIDTH;
-  /*! \brief 2D access pattern: start padding along y dimension */
-  uint64_t y_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
-  /*! \brief 2D access pattern: end padding along y dimension */
-  uint64_t y_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
-  /*! \brief 2D access pattern: start padding along x dimension */
-  uint64_t x_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
-  /*! \brief 2D access pattern: end padding along x dimension */
-  uint64_t x_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
-} VTAMemInsn;
-
-/*! \brief VTA GEMM instruction
-*   GEMM instruction is implemented by executing a sequence of micro-operations
-*   that is read in the local micro-op memory, delimited by \a uop_bgn and
-*   \a uop_end. For improved storage-efficiency, the micro-operations can be
-*   executed in a 2-level nested loop as follows:
-*   \code{.cpp}
-*     for (i = 0; i < iter_out; i++) {
-*       for (j = 0; j < iter_in; j++) {
-*         for (k = uop_bgn; k < uop_end; k++) {
-*           // Read micro op
-*           uop_T uop = uop_mem[k];
-*           // Read in memory indices
-*           acc_idx_T acc_idx = uop.dst_idx;
-*           inp_idx_T inp_idx = uop.inp_idx;
-*           wgt_idx_T wgt_idx = uop.wgt_idx;
-*           // Update those indices with the following affine functions
-*           acc_idx += iter_in * dst_factor_in + iter_out * dst_factor_out;
-*           inp_idx += iter_in * src_factor_in + iter_out * src_factor_out;
-*           wgt_idx += iter_in * wgt_factor_in + iter_out * wgt_factor_out;
-*           // Perform GEMM operation
-*           acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt[wgt_idx]);
-*         }
-*       }
-*     }
-*   \endcode
-*
-*/
-typedef struct {
-  /*! \brief The instruction opcode */
-  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
-  /*! \brief Pop dependence token from load stage */
-  uint64_t pop_prev_dep   : 1;
-  /*! \brief Pop dependence token from store stage */
-  uint64_t pop_next_dep   : 1;
-  /*! \brief Push dependence token to load stage */
-  uint64_t push_prev_dep  : 1;
-  /*! \brief Push dependence token to store stage */
-  uint64_t push_next_dep  : 1;
-  /*! \brief Reset register */
-  uint64_t reset_reg      : 1;
-  /*! \brief Micro-op begin address */
-  uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
-  /*! \brief Micro-op end address */
-  uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH + 1;
-  /*! \brief Iterations in the outer uop execution loop */
-  uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
-  /*! \brief Iterations in the inner uop execution loop */
-  uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
-  /*! \brief Outer loop accumulator memory index factor */
-  uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
-  /*! \brief Inner loop accumulator memory index factor */
-  uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
-  /*! \brief Outer loop input memory index factor */
-  uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
-  /*! \brief Inner loop input memory index factor */
-  uint64_t src_factor_in  : VTA_LOG_INP_BUFF_DEPTH;
-  /*! \brief Outer loop weight memory index factor */
-  uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
-  /*! \brief Inner loop weight memory index factor */
-  uint64_t wgt_factor_in  : VTA_LOG_WGT_BUFF_DEPTH;
-} VTAGemInsn;
-
-/*! \brief VTA ALU instruction
-*   ALU instruction is implemented by executing a sequence of micro-operations
-*   that is read in the local micro-op memory, delimited by \a uop_bgn and
-*   \a uop_end. For improved storage-efficiency, the micro-operations can be
-*   executed in a 2-level nested loop as follows:
-*   \code{.cpp}
-*     for (i = 0; i < iter_out; i++) {
-*       for (j = 0; j < iter_in; j++) {
-*         for (k = uop_bgn; k < uop_end; k++) {
-*           // Read micro op
-*           uop_T uop = uop_mem[k];
-*           // Read in memory indices
-*           acc_idx_T dst_idx = uop.dst_idx;
-*           inp_idx_T src_idx = uop.inp_idx;
-*           // Update those indices with the following affine functions
-*           dst_idx += iter_in * dst_factor_in + iter_out * dst_factor_out;
-*           src_idx += iter_in * src_factor_in + iter_out * src_factor_out;
-*           // Perform ALU operation
-*           if (use_imm) {
-*             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
-*           } else {
-*             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
-*           }
-*         }
-*       }
-*     }
-*   \endcode
-*
-*/
-typedef struct {
-  /*! \brief The instruction opcode */
-  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
-  /*! \brief Pop dependence token from load stage */
-  uint64_t pop_prev_dep   : 1;
-  /*! \brief Pop dependence token from store stage */
-  uint64_t pop_next_dep   : 1;
-  /*! \brief Push dependence token to load stage */
-  uint64_t push_prev_dep  : 1;
-  /*! \brief Push dependence token to store stage */
-  uint64_t push_next_dep  : 1;
-  /*! \brief Reset register */
-  uint64_t reset_reg      : 1;
-  /*! \brief Micro-op begin address */
-  uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
-  /*! \brief Micro-op end address */
-  uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH + 1;
-  /*! \brief Iterations in the outer uop execution loop */
-  uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
-  /*! \brief Iterations in the inner uop execution loop */
-  uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
-  /*! \brief Outer loop accumulator memory destination index factor */
-  uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
-  /*! \brief Inner loop accumulator memory destination index factor */
-  uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
-  /*! \brief Outer loop accumulator memory source index factor */
-  uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
-  /*! \brief Inner loop accumulator memory source index factor */
-  uint64_t src_factor_in  : VTA_LOG_INP_BUFF_DEPTH;
-  /*! \brief ALU opcode */
-  uint64_t alu_opcode     : VTA_ALU_OPCODE_BIT_WIDTH;
-  /*! \brief Use immediate is true */
-  uint64_t use_imm        : 1;
-  /*! \brief Immediate value: allow negative value */
-  int64_t imm            : VTA_ALUOP_IMM_BIT_WIDTH;
-} VTAAluInsn;
-
-/*! \brief VTA ALU instruction converter */
-union VTAInsn {
-  /*! \brief VTA generic instruction */
-  VTAGenericInsn generic;
-  /*! \brief VTA load/store instruction */
-  VTAMemInsn mem;
-  /*! \brief VTA GEMM instruction */
-  VTAGemInsn gemm;
-  /*! \brief VTA ALU instruction */
-  VTAAluInsn alu;
-};
-
-/*! \brief VTA micro-op for GEMM/ALU instruction */
-typedef struct {
-  /*! \brief Destination index (indexes accum buffer) */
-  uint32_t dst_idx    : VTA_LOG_ACC_BUFF_DEPTH;
-  /*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
-  uint32_t src_idx    : VTA_LOG_INP_BUFF_DEPTH;
-  /*! \brief Weight index (indexes weight buffer) */
-  uint32_t wgt_idx    : VTA_LOG_WGT_BUFF_DEPTH;
-} VTAUop;
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_HW_SPEC_H_
diff --git a/vta/vta-hw/include/vta/sim_tlpp.h b/vta/vta-hw/include/vta/sim_tlpp.h
deleted file mode 100644
index 531dceb09d5b..000000000000
--- a/vta/vta-hw/include/vta/sim_tlpp.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sim_tlpp.h
- * \brief TVM VTA multiple thread simulator header file.
- */
-#ifndef VTA_SIM_TLPP_H_
-#define VTA_SIM_TLPP_H_
-#include <vta/hw_spec.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <vector>
-#include <ctime>
-#include <cassert>
-#include <queue>
-
-#define SCOREGEMM "gemm"
-#define SCORELOAD "load"
-#define SCORESTORE "store"
-#define SCOREUNKNOWN "unknown"
-typedef void (*Run_Function)(const VTAGenericInsn *, void *);
-typedef enum {COREGEMM = 0, CORELOAD, CORESTORE, COREMAX} CORE_TYPE;
-typedef std::queue<const void*> Insn_q_t;
-typedef std::queue<int> Dep_q_t;
-/*!
- * \brief simulate core level pipe line parallism logic.
- */
-class TlppVerify {
- public:
-    /*! Return TlppVefiy class instance.*/
-    static TlppVerify *Global() { static TlppVerify Cls; return &Cls;}
-
-    /*!
-     *  \brief Loop to process instruction and verify tlpp logic.
-     *  \param run_function function pointer to excute instruction .
-     *  \param fsim_handle class pointer of function simulator class Device.
-     *  \param debug to enable/disable debug
-     */
-    void TlppSynchronization(Run_Function run_function,
-                             void *fsim_handle,
-                             bool debug = false);
-    /*!
-     *  \brief Push instruction into queue for later excute.
-     *  \param insn instructions.
-     */
-    void TlppPushInsn(const VTAGenericInsn *insn);
-    /*! \ Event pump to handle dependency event. */
-    void EventProcess(void);
-    /*! \ Schedule a paticular core to run. */
-    void CoreRun(CORE_TYPE core_type);
-
- private:
-    /*! TlppVerify construction function.*/
-    TlppVerify();
-    /*!
-     * \brief clear class variable.
-     */
-    void Clear();
-    /*!
-     * \ brief check if the insn dependency condition satisfy and do notify.
-     * \ param insn instructions.
-     * \ param before_run identify this check is happen before
-     *   instruction excute or after instruction excute, for before
-     *   scenario need to check if depency condition satisfy, for post
-     *   case need to check if need to send notfication.
-     */
-    bool InsnDependencyCheck(const VTAGenericInsn *insn, bool before_run);
-    /*!
-     * \ brief get operation code from insn
-     * \ param insn instructions
-     */
-    uint64_t GetOperationCode(const VTAGenericInsn *insn);
-    /*!
-     * \ brief find which core should run this instruction.
-     * \ param operation_code operation type like load/gemm etc.
-     * \ param insn instructions.
-     */
-    CORE_TYPE GetCoreType(uint64_t operation_code, const VTAGenericInsn *insn);
-    /*!
-     * \ brief , pick up first instruction for specify core.
-     * \ param core_type core type
-     */
-    const VTAGenericInsn *PickFrontInsn(uint64_t core_type);
-    /*!
-     * \ brief consume one instruction after pass dependency condition.
-     * \ param core_type core type
-     */
-    void ConsumeFrontInsn(uint64_t core_type);
-    /*!
-     * \ brief, process dependency logic
-     * param before_run if this call happen before instruction run.
-     * param pop_prev if instruction have previous core dependency.
-     * param pop_next if instruction have depency for next core.
-     * param pop_prev_q notification from previous core.
-     * param pop_next_q notification from next core.
-     * param push_prev_q notification queue need to send notification
-     * for prevous core.
-     * param push_next_q notification queue need to send notification
-     * from next core.
-     * push_to_prev_q_indx which core need wake up if have notification
-     * fro previous core.
-     * push_to_next_q_indx which core need wake up if have notification
-     * fro next core.
-     */
-    bool DependencyProcess(bool before_run,
-        bool pop_prev, bool pop_next,
-        bool push_prev, bool push_next,
-        Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
-        Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
-        CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx);
-    /*!
-     * \ brief , return name based on core type.
-     * \ param core_type core type
-     */
-    inline const char * GetCoreTypeName(CORE_TYPE core_type) {
-      return (core_type == COREGEMM) ? SCOREGEMM :
-        (core_type == CORELOAD) ? SCORELOAD :
-        (core_type == CORESTORE) ? SCORESTORE :
-        SCOREUNKNOWN;
-    }
-    /*! debug flag*/
-    bool debug_;
-    /*! function simulator device class pointer*/
-    void *fsim_handle_;
-    /*! function simulator instruction excute function pointer*/
-    Run_Function run_fsim_function_;
-    /*! instruction queue for each core*/
-    Insn_q_t insnq_array_[COREMAX];
-    /*! dependency queue from load to gemm*/
-    Dep_q_t l2g_q_;
-    /*! dependency queue from store to gemm*/
-    Dep_q_t s2g_q_;
-    /*! dependency queue from gemm to load*/
-    Dep_q_t g2l_q_;
-    /*! dependency queue from gemm to store*/
-    Dep_q_t g2s_q_;
-    /*! computation done*/
-    int done_;
-    /*! event queue for core wake up*/
-    std::queue<CORE_TYPE> dep_push_event_;
-};
-#endif  // VTA_SIM_TLPP_H_
diff --git a/vta/vta-hw/src/de10nano/cma_api.cc b/vta/vta-hw/src/de10nano/cma_api.cc
deleted file mode 100644
index 10941cfae6ed..000000000000
--- a/vta/vta-hw/src/de10nano/cma_api.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-/*!
- * \file cma_api.cc
- * \brief Application layer implementation for contigous memory allocation.
- */
-
-#include "cma_api.h"
-#include <cma/cma_api_impl.h>
diff --git a/vta/vta-hw/src/de10nano/cma_api.h b/vta/vta-hw/src/de10nano/cma_api.h
deleted file mode 100644
index 5e1653f172c7..000000000000
--- a/vta/vta-hw/src/de10nano/cma_api.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file cma_api.h
- * \brief API for contigous memory allocation driver.
- */
-
-#ifndef VTA_DE10NANO_CMA_API_H_
-#define VTA_DE10NANO_CMA_API_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stddef.h>
-
-/**
- * \brief Initialize CMA api (basically perform open() syscall).
- *
- * \return Returns 0 on SUCCESS. On FAILURE returns -1 and errno is set
- * accordingly.
- */
-int cma_init(void);
-
-
-/**
- * \brief Release CMA api (basically perform close() syscall).
- *
- * \return Returns 0 on SUCCESS. On FAILURE returns -1 and errno is set
- * accordingly.
- */
-int cma_release(void);
-
-
-/**
- * \brief Allocate cached, physically contigous memory.
- *
- * \param size Size in bytes.
- *
- * \return Returns NULL on FAILURE. Otherwise pointer to valid userspace
- * memory.
- */
-void *cma_alloc_cached(size_t size);
-
-
-/**
- * \brief Allocate noncached, physically contigous memory.
- *
- * \param size Size in bytes.
- *
- * \return Returns NULL on FAILURE. Otherwise pointer to valid userspace
- * memory.
- */
-void *cma_alloc_noncached(size_t size);
-
-
-/**
- * \brief Release physically contigous memory.
- *
- * \param mem Pointer to previously allocated contiguous memory.
- *
- * \return Returns 0 on SUCCESS, -1 on FAILURE.
- */
-int cma_free(void *mem);
-
-
-/**
- * \brief Get physical memory of cma memory block (should be used for DMA).
- *
- * \param mem Pointer to previously allocated contiguous memory.
- *
- * \return Returns address on SUCCESS, 0 on FAILURE.
- */
-unsigned cma_get_phy_addr(void *mem);
-
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_DE10NANO_CMA_API_H_
diff --git a/vta/vta-hw/src/de10nano/de10nano_driver.cc b/vta/vta-hw/src/de10nano/de10nano_driver.cc
deleted file mode 100644
index 94d000114dfc..000000000000
--- a/vta/vta-hw/src/de10nano/de10nano_driver.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file de10-nano_driver.cc
- * \brief VTA driver for DE10_Nano board.
- */
-
-#include "de10nano_driver.h"
-#include "de10nano_mgr.h"
-
-#include <string.h>
-#include <vta/driver.h>
-#include <tvm/runtime/registry.h>
-#include <dmlc/logging.h>
-#include <thread>
-#include <string>
-#include "cma_api.h"
-
-void* VTAMemAlloc(size_t size, int cached) {
-  static int _ = cma_init(); (void)_;
-  if (cached) {
-    return cma_alloc_cached(size);
-  } else {
-    return cma_alloc_noncached(size);
-  }
-}
-
-void VTAMemFree(void* buf) {
-  cma_free(buf);
-}
-
-vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
-  return cma_get_phy_addr(buf) + 0x80000000;
-}
-
-void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
-  // For SoC-based FPGAs that used shared memory with the CPU, use memcopy()
-  memcpy(dst, src, size);
-}
-
-void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
-  // For SoC-based FPGAs that used shared memory with the CPU, use memcopy()
-  memcpy(dst, src, size);
-}
-
-void VTAFlushCache(void * offset, vta_phy_addr_t buf, int size) {
-  CHECK(false) << "VTAFlushCache not implemented for de10nano";
-  printf("VTAFlushCache not implemented for de10nano");
-}
-
-void VTAInvalidateCache(void * offset, vta_phy_addr_t buf, int size) {
-  CHECK(false) << "VTAInvalidateCache not implemented for de10nano";
-  printf("VTAInvalidateCache not implemented for de10nano");
-}
-
-void *VTAMapRegister(uint32_t addr) {
-  // Align the base address with the pages
-  uint32_t virt_base = addr & ~(getpagesize() - 1);
-  // Calculate base address offset w.r.t the base address
-  uint32_t virt_offset = addr - virt_base;
-  // Open file and mmap
-  uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
-  // Note that if virt_offset != 0, i.e. addr is not page aligned
-  // munmap will not be unmapping all memory.
-  void *vmem = mmap(NULL,
-              (VTA_IP_REG_MAP_RANGE + virt_offset),
-              PROT_READ|PROT_WRITE,
-              MAP_SHARED,
-              mmap_file,
-              virt_base);
-  close(mmap_file);
-  return vmem;
-}
-
-void VTAUnmapRegister(void *vta) {
-  // Unmap memory
-  int status = munmap(vta, VTA_IP_REG_MAP_RANGE);
-  assert(status == 0);
-}
-
-void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
-  *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
-}
-
-uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
-  return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
-}
-
-class VTADevice {
- public:
-  VTADevice() {
-    // VTA stage handles
-    vta_host_handle_ = VTAMapRegister(VTA_HOST_ADDR);
-  }
-
-  ~VTADevice() {
-    // Close VTA stage handle
-    VTAUnmapRegister(vta_host_handle_);
-  }
-
-  int Run(vta_phy_addr_t insn_phy_addr,
-          uint32_t insn_count,
-          uint32_t wait_cycles) {
-    VTAWriteMappedReg(vta_host_handle_, 0x04, 0);
-    VTAWriteMappedReg(vta_host_handle_, 0x08, insn_count);
-    VTAWriteMappedReg(vta_host_handle_, 0x0c, insn_phy_addr);
-
-    // VTA start
-    VTAWriteMappedReg(vta_host_handle_, 0x0, VTA_START);
-
-    // Loop until the VTA is done
-    unsigned t, flag = 0;
-    for (t = 0; t < wait_cycles; ++t) {
-      flag = VTAReadMappedReg(vta_host_handle_, 0x00);
-      flag &= 0x2;
-      if (flag == 0x2) break;
-      std::this_thread::yield();
-    }
-    // Report error if timeout
-    return t < wait_cycles ? 0 : 1;
-  }
-
- private:
-  // VTA handles (register maps)
-  void* vta_host_handle_{nullptr};
-};
-
-VTADeviceHandle VTADeviceAlloc() {
-  return new VTADevice();
-}
-
-void VTADeviceFree(VTADeviceHandle handle) {
-  delete static_cast<VTADevice*>(handle);
-}
-
-int VTADeviceRun(VTADeviceHandle handle,
-                 vta_phy_addr_t insn_phy_addr,
-                 uint32_t insn_count,
-                 uint32_t wait_cycles) {
-  return static_cast<VTADevice*>(handle)->Run(
-      insn_phy_addr, insn_count, wait_cycles);
-}
-
-void VTAProgram(const char *rbf) {
-  De10NanoMgr mgr;
-  CHECK(mgr.mapped()) << "de10nano: mapping of /dev/mem failed";
-  CHECK(mgr.program_rbf(rbf)) << "Programming of the de10nano failed.\n"
-  "This is usually due to the use of an RBF file that is incompatible "
-  "with the MSEL switches on the DE10-Nano board. The recommended RBF "
-  "format is FastPassiveParallel32 with compression enabled, "
-  "corresponding to MSEL 01010. An RBF file in FPP32 mode can be "
-  "generated in a Quartus session with the command "
-  "'quartus_cpf -o bitstream_compression=on -c <file>.sof <file>.rbf'.";
-}
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("vta.de10nano.program")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    std::string bitstream = args[0];
-    VTAProgram(bitstream.c_str());
-});
-
diff --git a/vta/vta-hw/src/de10nano/de10nano_driver.h b/vta/vta-hw/src/de10nano/de10nano_driver.h
deleted file mode 100644
index 0009e7574b02..000000000000
--- a/vta/vta-hw/src/de10nano/de10nano_driver.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file de10-nano_driver.h
- * \brief VTA driver for DE10_Nano board.
- */
-
-#ifndef VTA_DE10NANO_DE10NANO_DRIVER_H_
-#define VTA_DE10NANO_DE10NANO_DRIVER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include <assert.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-void *VTAMapRegister(uint32_t addr);
-void VTAUnmapRegister(void *vta);
-void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
-uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
-void VTAProgram(const char* bitstream);
-
-/*! \brief VTA configuration register address range */
-#define VTA_RANGE 0x400
-/*! \brief VTA configuration register start value */
-#define VTA_START 0x1
-/*! \brief VTA configuration register auto-restart value */
-#define VTA_AUTORESTART 0x81
-/*! \brief VTA configuration register done value */
-#define VTA_DONE 0x2
-
-/*! \brief VTA fetch stage configuration register address
-*/
-#define VTA_HOST_ADDR    0xFF220000
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_DE10NANO_DE10NANO_DRIVER_H_
diff --git a/vta/vta-hw/src/de10nano/de10nano_mgr.h b/vta/vta-hw/src/de10nano/de10nano_mgr.h
deleted file mode 100644
index a054640b4191..000000000000
--- a/vta/vta-hw/src/de10nano/de10nano_mgr.h
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file de10nano_mgr.h
- * \brief DE10-Nano fpga manager.
- */
-
-#ifndef VTA_DE10NANO_DE10NANO_MGR_H_
-#define VTA_DE10NANO_DE10NANO_MGR_H_
-
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-
-// Register definition and address map taken from cv_5v4.pdf,
-// Cyclone V Hard Processor System Technical Reference Manual,
-// chapter 5: FPGA Manager.
-struct De10NanoMgr {
-  // Reg32 is a static base class interface and implementation
-  // of a generic 32 bit register that avoids the use of a virtual
-  // class and ugly bit shift manipulations.
-  struct Reg32 {
-    explicit Reg32(uint32_t offset, uint32_t reset = 0) :
-      m_offset(offset),
-      m_reset(reset)
-    {}
-    void map(uint8_t *base) {
-      m_addr = reinterpret_cast<uint32_t*>(base + m_offset);
-      m_reg  = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(this)+sizeof(Reg32));
-    }
-    uint32_t read() {
-      *m_reg = *m_addr;
-      return *m_reg;
-    }
-    void write() { *m_addr = *m_reg; }
-    void write(uint32_t value) { *m_addr = *m_reg = value; }
-    void clear() { *m_reg = 0; }
-    void reset() { *m_reg = m_reset; }
-    void print(const char *name, bool addr = false) {
-      if (addr)
-        printf("DE10-Nano-Mgr: %16s: 0x%08x addr: %p\n", name, read(), m_addr);
-      else
-        printf("DE10-Nano-Mgr: %16s: 0x%08x\n", name, read());
-    }
-
-    uint32_t m_offset, m_reset, *m_reg;
-    volatile uint32_t *m_addr;
-
-   private:  // Do not use this class on its own.
-    Reg32(const Reg32 &rhs);
-  };
-
-  // Register definitions. All registers are of 32 bit size.
-  // Add one structure for each register, making sure that all
-  // bit fields come first and pack exactly into 32 bits.
-
-  struct data : public Reg32 {
-    data() : Reg32(0x0, 0x0) {}
-    uint32_t value;
-  } data;
-
-  struct stat : public Reg32 {
-    stat() : Reg32(0x0, 0x45) {}
-    enum mode_values {
-      FPGA_POWER_OFF    = 0x0,
-      FPGA_RESET_PHASE  = 0x1,
-      FPGA_CONFIG_PHASE = 0x2,
-      FPGA_INIT_PHASE   = 0x3,
-      FPGA_USER_MODE    = 0x4,
-      FPGA_ZOMBIE_MODE  = 0x5
-    };
-
-    enum msel_values {
-      FPP16_AESN_ZIPN = 0x0,
-      FPP32_AESO_ZIPY = 0xA
-    };
-
-    const char * mode_str() {
-      const char *str = "UNKNOWN";
-      switch (mode) {
-        case FPGA_POWER_OFF    : str = "POWER_OFF"    ; break;
-        case FPGA_RESET_PHASE  : str = "RESET_PHASE"  ; break;
-        case FPGA_CONFIG_PHASE : str = "CONFIG_PHASE" ; break;
-        case FPGA_INIT_PHASE   : str = "INIT_PHASE"   ; break;
-        case FPGA_USER_MODE    : str = "USER_MODE"    ; break;
-        case FPGA_ZOMBIE_MODE  : str = "UNDEF_MODE"   ; break;
-      }
-      return str;
-    }
-
-    bool msel_is_invalid() {
-      return msel & 0x10 || (msel & 0x3) == 0x3;
-    }
-
-    void print(bool addr = false, bool fields = true) {
-      Reg32::print("stat", addr);
-      if (fields) {
-        printf("DE10-Nano-Mgr: %16s: %x\n", "msel", msel);
-        printf("DE10-Nano-Mgr: %16s: %s\n", "mode", mode_str());
-      }
-    }
-
-    uint32_t mode :  3;  //  2:0 RW
-    uint32_t msel :  5;  //  7:3 RO
-    uint32_t rsvd : 24;  // 31:8
-  } stat;
-
-  struct ctrl : public Reg32 {
-    ctrl() : Reg32(0x4, 0x200) {}
-
-    uint32_t           en :  1;  //     0 RW
-    uint32_t          nce :  1;  //     1 RW
-    uint32_t  nconfigpull :  1;  //     2 RW
-    uint32_t  nstatuspull :  1;  //     3 RW
-    uint32_t confdonepull :  1;  //     4 RW
-    uint32_t        prreq :  1;  //     5 RW
-    uint32_t      cdratio :  2;  //   7:6 RW
-    uint32_t     axicfgen :  1;  //     8 RW
-    uint32_t      cfgwdth :  1;  //     9 RW
-    uint32_t         rsvd : 22;  // 31:10
-
-    void print(bool addr = false, bool fields = true) {
-      Reg32::print("ctrl", addr);
-      if (fields) {
-        printf("DE10-Nano-Mgr: %16s: %x\n", "en"          , en);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nce"         , nce);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nconfigpull" , nconfigpull);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nstatuspull" , nstatuspull);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "confdonepull", confdonepull);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "prreq"       , prreq);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "cdratio"     , cdratio);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "axicfgen"    , axicfgen);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "cfgwdth"     , cfgwdth);
-      }
-    }
-  } ctrl;
-
-  struct dclkcnt : public Reg32 {
-    dclkcnt() : Reg32(0x8, 0x0) {}
-    void print() { return Reg32::print("dclkcnt"); }
-
-    uint32_t cnt;  // RW
-  } dclkcnt;
-
-  struct dclkstat : public Reg32 {
-    dclkstat() : Reg32(0xC, 0x0) {}
-    void print() { return Reg32::print("dclkstat"); }
-
-    uint32_t dcntdone :  1;  // RW
-    uint32_t     rsvd : 31;
-  } dclkstat;
-
-  struct gpio_inten : public Reg32 {
-    gpio_inten() : Reg32(0x830, 0x0) {}
-    void print() { return Reg32::print("gpio_inten"); }
-
-    uint32_t    value : 32;  // RW
-  } gpio_inten;
-
-  struct gpio_porta_eoi : public Reg32 {
-    gpio_porta_eoi() : Reg32(0x84C, 0x0) {}
-    void print() { return Reg32::print("gpio_porta_eoi"); }
-
-    uint32_t   ns :  1;  //     0 WO
-    uint32_t   cd :  1;  //     1 WO
-    uint32_t   id :  1;  //     2 WO
-    uint32_t  crc :  1;  //     3 WO
-    uint32_t  ccd :  1;  //     4 WO
-    uint32_t  prr :  1;  //     5 WO
-    uint32_t  pre :  1;  //     6 WO
-    uint32_t  prd :  1;  //     7 WO
-    uint32_t  ncp :  1;  //     8 WO
-    uint32_t  nsp :  1;  //     9 WO
-    uint32_t  cdp :  1;  //    10 WO
-    uint32_t  fpo :  1;  //    11 WO
-    uint32_t rsvd : 20;  // 31:12
-  } gpio_porta_eoi;
-
-  struct gpio_ext_porta : public Reg32 {
-    gpio_ext_porta() : Reg32(0x850, 0x0) {}
-    void print(bool addr = false, bool fields = true) {
-      Reg32::print("gpio_ext_porta", addr);
-      if (fields) {
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nSTATUS"       , ns);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "CONF_DONE"     , cd);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "INIT_DONE"     , id);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "CRC_ERROR"     , crc);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "CVP_CONF_DONE" , ccd);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "PR_READY"      , prr);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "PR_ERROR"      , pre);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "PR_DONE"       , prd);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nCONFIG_PIN"   , ncp);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "nSTATUS_PIN"   , nsp);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "CONF_DONE_PIN" , cdp);
-        printf("DE10-Nano-Mgr: %16s: %x\n", "FPGA_POWER_ON" , fpo);
-      }
-    }
-
-    uint32_t   ns :  1;  //     0 RO
-    uint32_t   cd :  1;  //     1 RO
-    uint32_t   id :  1;  //     2 RO
-    uint32_t  crc :  1;  //     3 RO
-    uint32_t  ccd :  1;  //     4 RO
-    uint32_t  prr :  1;  //     5 RO
-    uint32_t  pre :  1;  //     6 RO
-    uint32_t  prd :  1;  //     7 RO
-    uint32_t  ncp :  1;  //     8 RO
-    uint32_t  nsp :  1;  //     9 RO
-    uint32_t  cdp :  1;  //    10 RO
-    uint32_t  fpo :  1;  //    11 RO
-    uint32_t rsvd : 20;  // 31:12
-  } gpio_ext_porta;
-
-  struct monitor {
-    // This is used to both break a polling loop if the specified number
-    // of milliseconds have passed and to relax the polling yielding the
-    // cpu every millisecond.
-    monitor() : msg(""), m_status(true), m_ticks(0), m_counter(0) {
-      m_epoc_us = time_stamp();
-    }
-
-    void init(const char *message, uint32_t ticks_ms = 1000) {
-      msg = message;
-      m_ticks = m_counter = ticks_ms;
-      m_init_us = time_stamp();
-      printf("DE10-Nano-Mgr: %-32s : ", msg);
-    }
-
-    bool status() { return m_status; }
-
-    void reset() { m_counter = m_ticks; }
-
-    void done(bool status = true) {
-      uint32_t elapsed = time_stamp(m_init_us);
-      const char *rs = "FAIL";
-      if (!m_counter) {
-        status = false;
-        rs = "TOUT";
-      } else if (status) {
-        rs = "PASS";
-      }
-      printf("\rDE10-Nano-Mgr: %-32s : %s in %u us\n", msg, rs, elapsed);
-      if (!status) {
-        m_status = false;
-        throw 1;
-      }
-    }
-
-    ~monitor() {
-      uint32_t elapsed = time_stamp(m_epoc_us);
-      const char *rs = m_status ? "SUCCESS" : "FAILURE";
-      printf("DE10-Nano-Mgr: EXIT %s in %u us\n", rs, elapsed);
-    }
-
-    uint64_t time_stamp(uint64_t base_us = 0) {
-      struct timeval tv;
-      gettimeofday(&tv, NULL);
-      return tv.tv_sec * 1000000L + tv.tv_usec - base_us;
-    }
-
-    bool operator() (bool cond) {
-      if (m_counter) {
-        if (!cond)
-          return false;
-        m_counter--;
-        usleep(1000);
-      }
-      return m_counter;
-    }
-    const char *msg;
-
-   private:
-    bool m_status;
-    uint32_t m_ticks, m_counter;
-    uint64_t m_init_us, m_epoc_us;
-  };
-
-  enum BaseAddr {
-    REGS_BASE_ADDR = 0xFF706000U,
-    DATA_BASE_ADDR = 0xFFB90000U
-  };
-
-  De10NanoMgr() {
-    m_page_size = sysconf(_SC_PAGE_SIZE);
-    #ifdef MOCK_DEVMEM
-    m_regs_base = reinterpret_cast<uint8_t*>(malloc(m_page_size));
-    m_data_base = reinterpret_cast<uint8_t*>(malloc(m_page_size));
-    #else
-    m_regs_base = map_mem(REGS_BASE_ADDR);
-    m_data_base = map_mem(DATA_BASE_ADDR);
-    #endif  // MOCK_DEVMEM
-    data.map(m_data_base);
-    stat.map(m_regs_base);
-    ctrl.map(m_regs_base);
-    dclkcnt.map(m_regs_base);
-    dclkstat.map(m_regs_base);
-    gpio_inten.map(m_regs_base);
-    gpio_porta_eoi.map(m_regs_base);
-    gpio_ext_porta.map(m_regs_base);
-  }
-
-  ~De10NanoMgr() {
-    #ifdef MOCK_DEVMEM
-    free(m_regs_base);
-    free(m_data_base);
-    #else
-    unmap_mem(m_regs_base);
-    unmap_mem(m_data_base);
-    #endif  // MOCK_DEVMEM
-  }
-
-  bool mapped() const { return m_regs_base && m_data_base; }
-
-  void print(bool addr = false) {
-    stat.print(addr, false);
-    ctrl.print(addr, false);
-    gpio_inten.print();
-    gpio_porta_eoi.print();
-    gpio_ext_porta.print(addr, false);
-  }
-
- private:
-  uint32_t msel_to_cfgwdth(uint32_t msel) {
-    return(msel & 0b1000) >> 3;
-  }
-
-  uint32_t msel_to_cdratio(uint32_t msel) {
-    uint32_t cfgwdth = msel_to_cfgwdth(msel);
-    uint32_t cdratio = msel & 0b11;
-    if (cfgwdth && cdratio)
-      cdratio++;
-    return cdratio;
-  }
-
-  uint8_t * map_mem(off_t addr, size_t pages = 1) {
-    if (m_page_size <= 0) { return NULL; }
-
-    int mem_fd = open("/dev/mem", O_SYNC | O_RDWR);
-    if (mem_fd < 0) { return NULL; }
-
-    void *vbase = mmap(NULL, pages*m_page_size, PROT_READ | PROT_WRITE,
-                       MAP_SHARED, mem_fd, addr & ~(pages*m_page_size-1));
-    if (vbase == MAP_FAILED) { return NULL; }
-
-    close(mem_fd);
-    return reinterpret_cast<uint8_t*>(vbase);
-  }
-
-  void unmap_mem(void *base, size_t pages = 1) {
-    if (base)
-      munmap(base, pages * m_page_size);
-  }
-
-  uint8_t *m_regs_base, *m_data_base;
-  size_t m_page_size;
-
- public:
-  // Configuration sequence documented at page A-34.
-  bool program_rbf(const char *rbf) {
-    monitor mon;
-    int rbf_fd;
-    uint32_t count = 0;
-    printf("DE10-Nano-Mgr: Programming FPGA from image %s\n", rbf);
-
-    try {
-      mon.init("Open RBF file");
-      rbf_fd = open(rbf, (O_RDONLY | O_SYNC));
-      mon.done(rbf_fd >= 0);
-
-      // 1. Set the cdratio and cfgwdth bits of the ctrl register in the
-      // FPGA manager registers (fpgamgrregs) to match the characteristics
-      // of the configuration image. Tese settings are dependent on the
-      // MSEL pins input.
-      // 2. Set the nce bit of the ctrl register to 0 to enable HPS
-      // configuration.
-      // 3. Set the en bit of the ctrl register to 1 to give the FPGA
-      // manager control of the configuration input signals.
-      // 4. Set the nconfigpull bit of the ctrl register to 1 to pull
-      // down the nCONFIG pin and put the FPGA portion of the device
-      // into the reset phase.
-      mon.init("Enable FPGA configuration");
-      stat.read();
-      if (stat.msel_is_invalid()) {
-        printf("DE10-Nano-Mgr: msel %x is not a valid HPS configuration\n", stat.msel);
-      } else {
-        ctrl.read();
-        ctrl.cdratio = msel_to_cdratio(stat.msel);
-        ctrl.cfgwdth = msel_to_cfgwdth(stat.msel);
-        ctrl.nce = 0;
-        ctrl.en = 1;
-        ctrl.nconfigpull = 1;
-        ctrl.write();
-      }
-      mon.done(!stat.msel_is_invalid());
-
-      // 5. Poll the mode bit of the stat register and wait until
-      // the FPGA enters the reset phase.
-      mon.init("Wait for FPGA to reset");
-      do {
-        stat.read();
-      } while (mon(stat.mode != stat::FPGA_RESET_PHASE));
-      mon.done();
-      stat.print();
-
-      // 6. Set the nconfigpull bit of the ctrl register to 0 to
-      // release the FPGA from reset.
-      mon.init("Release FPGA from reset");
-      ctrl.nconfigpull = 0;
-      ctrl.write();
-      mon.done();
-
-      // 7. Read the mode bit of the stat register and wait until
-      // the FPGA enters the configuration phase.
-      mon.init("Wait for configuration phase");
-      do {
-        stat.read();
-      } while (mon(stat.mode != stat::FPGA_CONFIG_PHASE));
-      mon.done();
-      stat.print();
-
-      // 8. Clear the interrupt bit of nSTATUS (ns) in the gpio interrupt
-      // register (fpgamgrregs.mon.gpio_porta_eoi).
-      mon.init("Clear nSTATUS interrupt bit");
-      gpio_porta_eoi.clear();
-      gpio_porta_eoi.ns = 1;
-      gpio_porta_eoi.write();
-      mon.done();
-
-      // 9. Set the axicfgen bit of the ctrl register to 1 to enable
-      // sending configuration data to the FPGA.
-      mon.init("Enable configuration on AXI");
-      ctrl.axicfgen = 1;
-      ctrl.write();
-      mon.done();
-
-      // 10. Write the configuration image to the configuration data register
-      // (data) in the FPGA manager module configuration data registers
-      // (fpgamgrdata). You can also choose to use a DMA controller to
-      // transfer the configuration image from a peripheral device to the
-      // FPGA manager.
-      ssize_t bytes;
-      mon.init("Write configuration Image");
-      do {
-        data.value = 0;
-        bytes = read(rbf_fd, &data.value, sizeof(data.value));
-        if (bytes > 0) {
-          if (!(count % (1<<16))) {
-            printf("\rDE10-Nano-Mgr: %-32s : %u B", mon.msg, count);
-            fflush(stdout);
-          }
-          data.write();
-          count += bytes;
-        }
-      } while (bytes == 4);
-      mon.done(count > 0);
-      printf("DE10-Nano-Mgr: %-32s : written %u B\n", mon.msg, count);
-      close(rbf_fd);
-
-      // 11. Use the fpgamgrregs.mon.gpio_ext_porta registers to monitor
-      // the CONF_DONE (cd) and nSTATUS (ns) bits.
-      mon.init("Wait for CONF_DONE");
-      do {
-        gpio_ext_porta.read();
-      } while (mon(gpio_ext_porta.cd != 1 && gpio_ext_porta.ns != 1));
-      mon.done();
-      stat.print();
-
-      // 12. Set the axicfgen bit of the ctrl register to 0 to disable
-      // configuration data on AXI slave.
-      mon.init("Disable configuration on AXI");
-      ctrl.axicfgen = 0;
-      ctrl.write();
-      mon.done();
-
-      // 13. Clear any previous DONE status by writing a 1 to the dcntdone
-      // bit of the DCLK status register (dclkstat) to clear the completed
-      // status ﬂag.
-      mon.init("Clear DCLK DONE status");
-      dclkstat.dcntdone = 1;
-      dclkstat.write();
-      mon.done();
-
-      // 14. Send the DCLKs required by the FPGA to enter the
-      // initialization phase.
-      mon.init("Send DCLK for init phase");
-      dclkcnt.cnt = 4;
-      dclkcnt.write();
-      mon.done();
-
-      // 15. Poll the dcntdone bit of the DCLK status register (dclkstat)
-      // until it changes to 1, which indicates that all the DCLKs have
-      // been sent.
-      mon.init("Wait for DCLK");
-      do {
-        dclkstat.read();
-      } while (mon(dclkstat.dcntdone != 1));
-      mon.done();
-
-      // 16. Write a 1 to the dcntdone bit of the DCLK status register to
-      // clear the completed status ﬂag.
-      mon.init("Clear DCLK status flag");
-      dclkstat.dcntdone = 1;
-      dclkstat.write();
-      mon.done();
-
-      // 17. Read the mode bit of the stat register to wait for the FPGA
-      // to enter user mode.
-      mon.init("Wait for FPGA user mode");
-      do {
-        stat.read();
-      } while (mon(stat.mode != stat::FPGA_USER_MODE));
-      mon.done();
-
-      // 18. Set the en bit of the ctrl register to 0 to allow the
-      // external pins to drive the configuration input signals.
-      mon.init("Release control");
-      ctrl.en = 0;
-      ctrl.write();
-      mon.done();
-    }
-    catch(int i) {
-      close(rbf_fd);
-      printf("DE10-Nano-Mgr: %-32s : written %u B\n", mon.msg, count);
-      print();
-    }
-
-    return mon.status();
-  }
-};
-
-#endif  // VTA_DE10NANO_DE10NANO_MGR_H_
diff --git a/vta/vta-hw/src/dpi/module.cc b/vta/vta-hw/src/dpi/module.cc
deleted file mode 100644
index bb8284cf8faf..000000000000
--- a/vta/vta-hw/src/dpi/module.cc
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-
-#include <vta/dpi/module.h>
-#include <vta/dpi/tsim.h>
-#if defined(_WIN32)
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#include <mutex>
-#include <queue>
-#include <thread>
-#include <condition_variable>
-#include <fstream>
-
-#include "../vmem/virtual_memory.h"
-
-namespace vta {
-namespace dpi {
-
-using namespace tvm::runtime;
-
-typedef void* DeviceHandle;
-
-struct HostRequest {
-  uint8_t opcode;
-  uint8_t addr;
-  uint32_t value;
-};
-
-struct HostResponse {
-  uint32_t value;
-};
-
-struct MemResponse {
-  uint8_t valid;
-  uint64_t value;
-};
-
-template <typename T>
-class ThreadSafeQueue {
- public:
-  void Push(const T item) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    queue_.push(std::move(item));
-    cond_.notify_one();
-  }
-
-  void WaitPop(T* item) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cond_.wait(lock, [this]{return !queue_.empty();});
-    *item = std::move(queue_.front());
-    queue_.pop();
-  }
-
-  bool TryPop(T* item, bool pop) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    if (queue_.empty()) return false;
-    *item = std::move(queue_.front());
-    if (pop) queue_.pop();
-    return true;
-  }
-
- private:
-  mutable std::mutex mutex_;
-  std::queue<T> queue_;
-  std::condition_variable cond_;
-};
-
-class SimDevice {
- public:
-  void Wait();
-  void Resume();
-  void Exit();
-  bool GetWaitStatus();
-  bool GetExitStatus();
-
- private:
-  bool wait_{false};
-  bool exit_{false};
-  mutable std::mutex mutex_;
-};
-
-class HostDevice {
- public:
-  void PushRequest(uint8_t opcode, uint8_t addr, uint32_t value);
-  bool TryPopRequest(HostRequest* r, bool pop);
-  void PushResponse(uint32_t value);
-  void WaitPopResponse(HostResponse* r);
-
- private:
-  mutable std::mutex mutex_;
-  ThreadSafeQueue<HostRequest> req_;
-  ThreadSafeQueue<HostResponse> resp_;
-};
-
-class MemDevice {
- public:
-  void SetRequest(uint8_t opcode, uint64_t addr, uint32_t len);
-  MemResponse ReadData(uint8_t ready);
-  void WriteData(uint64_t value);
-
- private:
-  uint64_t* raddr_{0};
-  uint64_t* waddr_{0};
-  uint32_t rlen_{0};
-  uint32_t wlen_{0};
-  std::mutex mutex_;
-};
-
-void SimDevice::Wait() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  wait_ = true;
-}
-
-void SimDevice::Resume() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  wait_ = false;
-}
-
-void SimDevice::Exit() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  exit_ = true;
-}
-
-bool SimDevice::GetWaitStatus() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  return wait_;
-}
-
-bool SimDevice::GetExitStatus() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  return exit_;
-}
-
-void HostDevice::PushRequest(uint8_t opcode, uint8_t addr, uint32_t value) {
-  HostRequest r;
-  r.opcode = opcode;
-  r.addr = addr;
-  r.value = value;
-  req_.Push(r);
-}
-
-bool HostDevice::TryPopRequest(HostRequest* r, bool pop) {
-  r->opcode = 0xad;
-  r->addr = 0xad;
-  r->value = 0xbad;
-  return req_.TryPop(r, pop);
-}
-
-void HostDevice::PushResponse(uint32_t value) {
-  HostResponse r;
-  r.value = value;
-  resp_.Push(r);
-}
-
-void HostDevice::WaitPopResponse(HostResponse* r) {
-  resp_.WaitPop(r);
-}
-
-void MemDevice::SetRequest(uint8_t opcode, uint64_t addr, uint32_t len) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  void * vaddr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(addr);
-
-  if (opcode == 1) {
-    wlen_ = len + 1;
-    waddr_ = reinterpret_cast<uint64_t*>(vaddr);
-  } else {
-    rlen_ = len + 1;
-    raddr_ = reinterpret_cast<uint64_t*>(vaddr);
-  }
-}
-
-MemResponse MemDevice::ReadData(uint8_t ready) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  MemResponse r;
-  r.valid = rlen_ > 0;
-  r.value = rlen_ > 0 ? *raddr_ : 0xdeadbeefdeadbeef;
-  if (ready == 1 && rlen_ > 0) {
-    raddr_++;
-    rlen_ -= 1;
-  }
-  return r;
-}
-
-void MemDevice::WriteData(uint64_t value) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (wlen_ > 0) {
-    *waddr_ = value;
-    waddr_++;
-    wlen_ -= 1;
-  }
-}
-
-class DPIModule final : public DPIModuleNode {
- public:
-  ~DPIModule() {
-    if (lib_handle_) Unload();
-  }
-
-  const char* type_key() const final {
-    return "vta-tsim";
-  }
-
-  PackedFunc GetFunction(
-      const std::string& name,
-      const ObjectPtr<Object>& sptr_to_self) final {
-    if (name == "WriteReg") {
-      return TypedPackedFunc<void(int, int)>(
-          [this](int addr, int value){
-            this->WriteReg(addr, value);
-          });
-    } else {
-      LOG(FATAL) << "Member " << name << "does not exists";
-      return nullptr;
-    }
-  }
-
-  void Init(const std::string& name) {
-    Load(name);
-    VTADPIInitFunc finit =  reinterpret_cast<VTADPIInitFunc>(
-        GetSymbol("VTADPIInit"));
-    CHECK(finit != nullptr);
-    finit(this, VTASimDPI, VTAHostDPI, VTAMemDPI);
-    ftsim_ = reinterpret_cast<VTADPISimFunc>(GetSymbol("VTADPISim"));
-    CHECK(ftsim_ != nullptr);
-  }
-
-  void SimLaunch() {
-    auto frun = [this]() {
-      (*ftsim_)();
-    };
-    tsim_thread_ = std::thread(frun);
-  }
-
-  void SimWait() {
-    sim_device_.Wait();
-  }
-
-  void SimResume() {
-    sim_device_.Resume();
-  }
-
-  void SimFinish() {
-    sim_device_.Exit();
-    tsim_thread_.join();
-  }
-
-  void WriteReg(int addr, uint32_t value) {
-    host_device_.PushRequest(1, addr, value);
-  }
-
-  uint32_t ReadReg(int addr) {
-    uint32_t value;
-    HostResponse* r = new HostResponse;
-    host_device_.PushRequest(0, addr, 0);
-    host_device_.WaitPopResponse(r);
-    value = r->value;
-    delete r;
-    return value;
-  }
-
- protected:
-  VTADPISimFunc ftsim_;
-  SimDevice sim_device_;
-  HostDevice host_device_;
-  MemDevice mem_device_;
-  std::thread tsim_thread_;
-
-  void SimDPI(dpi8_t* wait,
-              dpi8_t* exit) {
-    *wait = sim_device_.GetWaitStatus();
-    *exit = sim_device_.GetExitStatus();
-  }
-
-  void HostDPI(dpi8_t* req_valid,
-               dpi8_t* req_opcode,
-               dpi8_t* req_addr,
-               dpi32_t* req_value,
-               dpi8_t req_deq,
-               dpi8_t resp_valid,
-               dpi32_t resp_value) {
-    HostRequest* r = new HostRequest;
-    *req_valid = host_device_.TryPopRequest(r, req_deq);
-    *req_opcode = r->opcode;
-    *req_addr = r->addr;
-    *req_value = r->value;
-    if (resp_valid) {
-      host_device_.PushResponse(resp_value);
-    }
-    delete r;
-  }
-
-  void MemDPI(
-      dpi8_t req_valid,
-      dpi8_t req_opcode,
-      dpi8_t req_len,
-      dpi64_t req_addr,
-      dpi8_t wr_valid,
-      dpi64_t wr_value,
-      dpi8_t* rd_valid,
-      dpi64_t* rd_value,
-      dpi8_t rd_ready) {
-    MemResponse r = mem_device_.ReadData(rd_ready);
-    *rd_valid = r.valid;
-    *rd_value = r.value;
-    if (wr_valid) {
-      mem_device_.WriteData(wr_value);
-    }
-    if (req_valid) {
-      mem_device_.SetRequest(req_opcode, req_addr, req_len);
-    }
-  }
-
-  static void VTASimDPI(
-      VTAContextHandle self,
-      dpi8_t* wait,
-      dpi8_t* exit) {
-    static_cast<DPIModule*>(self)->SimDPI(
-        wait, exit);
-  }
-
-  static void VTAHostDPI(
-      VTAContextHandle self,
-      dpi8_t* req_valid,
-      dpi8_t* req_opcode,
-      dpi8_t* req_addr,
-      dpi32_t* req_value,
-      dpi8_t req_deq,
-      dpi8_t resp_valid,
-      dpi32_t resp_value) {
-    static_cast<DPIModule*>(self)->HostDPI(
-        req_valid, req_opcode, req_addr,
-        req_value, req_deq, resp_valid, resp_value);
-  }
-
-  static void VTAMemDPI(
-    VTAContextHandle self,
-    dpi8_t req_valid,
-    dpi8_t req_opcode,
-    dpi8_t req_len,
-    dpi64_t req_addr,
-    dpi8_t wr_valid,
-    dpi64_t wr_value,
-    dpi8_t* rd_valid,
-    dpi64_t* rd_value,
-    dpi8_t rd_ready) {
-    static_cast<DPIModule*>(self)->MemDPI(
-        req_valid, req_opcode, req_len,
-        req_addr, wr_valid, wr_value,
-        rd_valid, rd_value, rd_ready);
-  }
-
- private:
-  // Platform dependent handling.
-#if defined(_WIN32)
-  // library handle
-  HMODULE lib_handle_{nullptr};
-  // Load the library
-  void Load(const std::string& name) {
-    // use wstring version that is needed by LLVM.
-    std::wstring wname(name.begin(), name.end());
-    lib_handle_ = LoadLibraryW(wname.c_str());
-    CHECK(lib_handle_ != nullptr)
-        << "Failed to load dynamic shared library " << name;
-  }
-  void* GetSymbol(const char* name) {
-    return reinterpret_cast<void*>(
-        GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*)
-  }
-  void Unload() {
-    FreeLibrary(lib_handle_);
-  }
-#else
-  // Library handle
-  void* lib_handle_{nullptr};
-  // load the library
-  void Load(const std::string& name) {
-    lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
-    CHECK(lib_handle_ != nullptr)
-        << "Failed to load dynamic shared library " << name
-        << " " << dlerror();
-  }
-  void* GetSymbol(const char* name) {
-    return dlsym(lib_handle_, name);
-  }
-  void Unload() {
-    dlclose(lib_handle_);
-  }
-#endif
-};
-
-Module DPIModuleNode::Load(std::string dll_name) {
-  auto n = make_object<DPIModule>();
-  n->Init(dll_name);
-  return Module(n);
-}
-
-TVM_REGISTER_GLOBAL("runtime.module.loadfile_vta-tsim")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = DPIModuleNode::Load(args[0]);
-  });
-}  // namespace dpi
-}  // namespace vta
diff --git a/vta/vta-hw/src/pynq/pynq_driver.cc b/vta/vta-hw/src/pynq/pynq_driver.cc
deleted file mode 100644
index a37bb4e466af..000000000000
--- a/vta/vta-hw/src/pynq/pynq_driver.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file pynq_driver.c
- * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
- */
-
-#include <vta/driver.h>
-#include <thread>
-#include "pynq_driver.h"
-
-
-void* VTAMemAlloc(size_t size, int cached) {
-  assert(size <= VTA_MAX_XFER);
-  // Rely on the pynq-specific cma library
-  return cma_alloc(size, cached);
-}
-
-void VTAMemFree(void* buf) {
-  // Rely on the pynq-specific cma library
-  cma_free(buf);
-}
-
-vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
-  return cma_get_phy_addr(buf);
-}
-
-void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
-  // For SoC-based FPGAs that used shared memory with the CPU, use memcopy()
-  memcpy(dst, src, size);
-}
-
-void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
-  // For SoC-based FPGAs that used shared memory with the CPU, use memcopy()
-  memcpy(dst, src, size);
-}
-
-void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-  // Call the cma_flush_cache on the CMA buffer
-  // so that the FPGA can read the buffer data.
-  cma_flush_cache(vir_addr, phy_addr, size);
-}
-
-void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-  // Call the cma_invalidate_cache on the CMA buffer
-  // so that the host needs to read the buffer data.
-  cma_invalidate_cache(vir_addr, phy_addr, size);
-}
-
-void *VTAMapRegister(uint32_t addr) {
-  // Align the base address with the pages
-  uint32_t virt_base = addr & ~(getpagesize() - 1);
-  // Calculate base address offset w.r.t the base address
-  uint32_t virt_offset = addr - virt_base;
-  // Open file and mmap
-  uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
-  return mmap(NULL,
-              (VTA_IP_REG_MAP_RANGE + virt_offset),
-              PROT_READ|PROT_WRITE,
-              MAP_SHARED,
-              mmap_file,
-              virt_base);
-}
-
-void VTAUnmapRegister(void *vta) {
-  // Unmap memory
-  int status = munmap(vta, VTA_IP_REG_MAP_RANGE);
-  assert(status == 0);
-}
-
-void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
-  *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
-}
-
-uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
-  return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
-}
-
-class VTADevice {
- public:
-  VTADevice() {
-    // VTA stage handles
-    vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR);
-    vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR);
-    vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR);
-    vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR);
-  }
-
-  ~VTADevice() {
-    // Close VTA stage handle
-    VTAUnmapRegister(vta_fetch_handle_);
-    VTAUnmapRegister(vta_load_handle_);
-    VTAUnmapRegister(vta_compute_handle_);
-    VTAUnmapRegister(vta_store_handle_);
-  }
-
-  int Run(vta_phy_addr_t insn_phy_addr,
-          uint32_t insn_count,
-          uint32_t wait_cycles) {
-    VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
-    VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr);
-    VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0);
-    VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0);
-    VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0);
-    VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0);
-    VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0);
-
-    // VTA start
-    VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
-    VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
-    VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
-    VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
-
-    // Loop until the VTA is done
-    unsigned t, flag = 0;
-    for (t = 0; t < wait_cycles; ++t) {
-      flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET);
-      if (flag == VTA_DONE) break;
-      std::this_thread::yield();
-    }
-    // Report error if timeout
-    return t < wait_cycles ? 0 : 1;
-  }
-
- private:
-  // VTA handles (register maps)
-  void* vta_fetch_handle_{nullptr};
-  void* vta_load_handle_{nullptr};
-  void* vta_compute_handle_{nullptr};
-  void* vta_store_handle_{nullptr};
-};
-
-VTADeviceHandle VTADeviceAlloc() {
-  return new VTADevice();
-}
-
-void VTADeviceFree(VTADeviceHandle handle) {
-  delete static_cast<VTADevice*>(handle);
-}
-
-int VTADeviceRun(VTADeviceHandle handle,
-                 vta_phy_addr_t insn_phy_addr,
-                 uint32_t insn_count,
-                 uint32_t wait_cycles) {
-  return static_cast<VTADevice*>(handle)->Run(
-      insn_phy_addr, insn_count, wait_cycles);
-}
diff --git a/vta/vta-hw/src/pynq/pynq_driver.h b/vta/vta-hw/src/pynq/pynq_driver.h
deleted file mode 100644
index bb6ca3db2b93..000000000000
--- a/vta/vta-hw/src/pynq/pynq_driver.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * \file pynq_driver.h
- * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
- */
-
-#ifndef VTA_PYNQ_PYNQ_DRIVER_H_
-#define VTA_PYNQ_PYNQ_DRIVER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include <assert.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#if defined(__arm__) || defined(__aarch64__)
-#include <libxlnk_cma.h>
-#else
-void* cma_alloc(size_t size, int cached);
-void cma_free(void* buf);
-uint32_t cma_get_phy_addr(void* buf);
-void cma_flush_cache(void* buf, unsigned int phys_addr, int size);
-void cma_invalidate_cache(void* buf, unsigned int phys_addr, int size);
-#endif
-
-void *VTAMapRegister(uint32_t addr);
-void VTAUnmapRegister(void *vta);
-void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
-uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
-
-/*! \brief VTA configuration register start value */
-#define VTA_START 0x1
-/*! \brief VTA configuration register auto-restart value */
-#define VTA_AUTORESTART 0x81
-/*! \brief VTA configuration register done value */
-#define VTA_DONE 0x1
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_PYNQ_PYNQ_DRIVER_H_
diff --git a/vta/vta-hw/src/sim/sim_driver.cc b/vta/vta-hw/src/sim/sim_driver.cc
deleted file mode 100644
index b00f41daf1f6..000000000000
--- a/vta/vta-hw/src/sim/sim_driver.cc
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sim_driver.cc
- * \brief VTA driver for simulated backend.
- */
-#include <vta/driver.h>
-#include <vta/hw_spec.h>
-#include <tvm/runtime/registry.h>
-#include <vta/sim_tlpp.h>
-#include <type_traits>
-#include <mutex>
-#include <map>
-#include <unordered_map>
-#include <cstring>
-#include <sstream>
-
-#include "../vmem/virtual_memory.h"
-
-namespace vta {
-namespace sim {
-
-/*! \brief debug flag for skipping computation */
-enum DebugFlagMask {
-  kSkipExec = 1
-};
-
-/*!
- * \brief Helper class to pack and unpack bits
- *  Applies truncation when pack to low level bits.
- *
- * \tparam bits The number of bits in integer.
- * \note This implementation relies on little endian.
- */
-template<uint32_t bits>
-class BitPacker {
- public:
-  explicit BitPacker(void* data) {
-    data_ = static_cast<uint32_t*>(data);
-  }
-
-  uint32_t GetUnsigned(uint32_t index) const {
-    if (bits == 32) {
-      return data_[index];
-    } else if (bits == 16) {
-      return reinterpret_cast<uint16_t*>(data_)[index];
-    } else if (bits == 8) {
-      return reinterpret_cast<uint8_t*>(data_)[index];
-    } else {
-      uint32_t offset = index / kNumPackElem;
-      uint32_t shift = index % kNumPackElem;
-      return (data_[offset] >> shift) & kMask;
-    }
-  }
-
-  int32_t GetSigned(uint32_t index) const {
-    if (bits == 32) {
-      return reinterpret_cast<int32_t*>(data_)[index];
-    } else if (bits == 16) {
-      return reinterpret_cast<int16_t*>(data_)[index];
-    } else if (bits == 8) {
-      return reinterpret_cast<int8_t*>(data_)[index];
-    } else {
-      uint32_t offset = index / kNumPackElem;
-      uint32_t shift = (index % kNumPackElem) * bits;
-      int32_t uvalue = static_cast<int32_t>(
-          (data_[offset] >> shift) & kMask);
-      int kleft = 32 - bits;
-      return (uvalue << kleft) >> kleft;
-    }
-  }
-
-  void SetUnsigned(uint32_t index, uint32_t value) {
-    if (bits == 32) {
-      data_[index] = value;
-    } else if (bits == 16) {
-      reinterpret_cast<uint16_t*>(data_)[index] = value;
-    } else if (bits == 8) {
-      reinterpret_cast<uint8_t*>(data_)[index] = value;
-    } else {
-      uint32_t offset = index / kNumPackElem;
-      uint32_t shift = (index % kNumPackElem) * bits;
-      data_[offset] &= (~(kMask << shift));
-      data_[offset] |= (value & kMask) << shift;
-    }
-  }
-
-  void SetSigned(uint32_t index, int32_t value) {
-    if (bits == 32) {
-      reinterpret_cast<int32_t*>(data_)[index] = value;
-    } else if (bits == 16) {
-      reinterpret_cast<int16_t*>(data_)[index] = value;
-    } else if (bits == 8) {
-      reinterpret_cast<int8_t*>(data_)[index] = value;
-    } else {
-      uint32_t offset = index / kNumPackElem;
-      uint32_t shift = (index % kNumPackElem) * bits;
-      data_[offset] &= (~(kMask << shift));
-      data_[offset] |= static_cast<uint32_t>(value & kMask) << shift;
-    }
-  }
-
- private:
-  uint32_t* data_;
-  static constexpr uint32_t kNumPackElem = 32 / bits;
-  static constexpr uint32_t kMask = (1U << (bits >= 32U ? 31U : bits)) - 1U;
-};
-
-/*!
- * \brief DRAM memory manager
- *  Implements simple paging to allow physical address translation.
- */
-using DRAM = ::vta::vmem::VirtualMemoryManager;
-
-/*!
- * \brief Register file.
- * \tparam kBits Number of bits of one value.
- * \tparam kLane Number of lanes in one element.
- * \tparam kMaxNumElem Maximum number of element.
- */
-template<int kBits, int kLane, int kMaxNumElem>
-class SRAM {
- public:
-  /*! \brief Bytes of single vector element */
-  static const int kElemBytes = (kBits * kLane + 7) / 8;
-  /*! \brief content data type */
-  using DType = typename std::aligned_storage<kElemBytes, kElemBytes>::type;
-  SRAM() {
-    data_ = new DType[kMaxNumElem];
-  }
-  ~SRAM() {
-    delete [] data_;
-  }
-  // Get the i-th index
-  void* BeginPtr(uint32_t index) {
-    CHECK_LT(index, kMaxNumElem);
-    return &(data_[index]);
-  }
-  // Execute the load instruction on this SRAM
-  void Load(const VTAMemInsn* op,
-            DRAM* dram,
-            uint64_t* load_counter,
-            bool skip_exec) {
-    load_counter[0] += (op->x_size * op->y_size) * kElemBytes;
-    if (skip_exec) return;
-    DType* sram_ptr = data_ + op->sram_base;
-    uint8_t* dram_ptr = static_cast<uint8_t*>(dram->GetAddr(
-        op->dram_base * kElemBytes));
-    uint64_t xtotal = op->x_size + op->x_pad_0 + op->x_pad_1;
-    uint32_t ytotal = op->y_size + op->y_pad_0 + op->y_pad_1;
-    uint64_t sram_end = op->sram_base + xtotal * ytotal;
-    CHECK_LE(sram_end, kMaxNumElem);
-    memset(sram_ptr, 0, kElemBytes * xtotal * op->y_pad_0);
-    sram_ptr += xtotal * op->y_pad_0;
-
-    for (uint32_t y = 0; y < op->y_size; ++y) {
-      memset(sram_ptr, 0, kElemBytes * op->x_pad_0);
-      sram_ptr += op->x_pad_0;
-      memcpy(sram_ptr, dram_ptr, kElemBytes * op->x_size);
-      sram_ptr += op->x_size;
-      memset(sram_ptr, 0, kElemBytes * op->x_pad_1);
-      sram_ptr += op->x_pad_1;
-      dram_ptr += kElemBytes * op->x_stride;
-    }
-    memset(sram_ptr, 0, kElemBytes * xtotal * op->y_pad_1);
-  }
-  // Execute the store instruction on this SRAM apply trucation.
-  // This relies on the elements is 32 bits
-  template<int target_bits>
-  void TruncStore(const VTAMemInsn* op, DRAM* dram) {
-    CHECK_EQ(op->x_pad_0, 0);
-    CHECK_EQ(op->x_pad_1, 0);
-    CHECK_EQ(op->y_pad_0, 0);
-    CHECK_EQ(op->y_pad_1, 0);
-    int target_width = (target_bits * kLane + 7) / 8;
-    BitPacker<kBits> src(data_ + op->sram_base);
-    BitPacker<target_bits> dst(dram->GetAddr(op->dram_base * target_width));
-    for (uint32_t y = 0; y < op->y_size; ++y) {
-      for (uint32_t x = 0; x < op->x_size; ++x) {
-        uint32_t sram_base = y * op->x_size + x;
-        uint32_t dram_base = y * op->x_stride + x;
-        for (int i = 0; i < kLane; ++i) {
-          dst.SetSigned(dram_base * kLane + i,
-                        src.GetSigned(sram_base * kLane +i));
-        }
-      }
-    }
-  }
-
- private:
-  /*! \brief internal data content */
-  DType* data_;
-};
-
-
-/*!
- * \brief Memory information of special memory region.
- *  Use MemoryInfo as its container type
- */
-class Profiler {
- public:
-  /*! \brief The memory load statistics */
-  uint64_t inp_load_nbytes{0};
-  /*! \brief The memory load statistics */
-  uint64_t wgt_load_nbytes{0};
-  /*! \brief The ACC memory load statistics */
-  uint64_t acc_load_nbytes{0};
-  /*! \brief The ACC memory load statistics */
-  uint64_t uop_load_nbytes{0};
-  /*! \brief The ACC memory load statistics */
-  uint64_t out_store_nbytes{0};
-  /*! \brief instr counter for gemm */
-  uint64_t gemm_counter{0};
-  /*! \brief instr counter for ALU ops */
-  uint64_t alu_counter{0};
-  /*! \brief set debug mode */
-  int64_t debug_flag{0};
-  /*! \brief clear the profiler */
-  void Clear() {
-    inp_load_nbytes = 0;
-    wgt_load_nbytes = 0;
-    acc_load_nbytes = 0;
-    uop_load_nbytes = 0;
-    out_store_nbytes = 0;
-    gemm_counter = 0;
-    alu_counter = 0;
-  }
-  /*! \return Whether we should skip execution. */
-  bool SkipExec() const {
-    return (debug_flag & DebugFlagMask::kSkipExec) != 0;
-  }
-
-  std::string AsJSON() {
-    std::ostringstream os;
-    os << "{\n"
-       << " \"inp_load_nbytes\":" << inp_load_nbytes << ",\n"
-       << " \"wgt_load_nbytes\":" << wgt_load_nbytes << ",\n"
-       << " \"acc_load_nbytes\":" << acc_load_nbytes << ",\n"
-       << " \"uop_load_nbytes\":" << uop_load_nbytes << ",\n"
-       << " \"out_store_nbytes\":" << out_store_nbytes << ",\n"
-       << " \"gemm_counter\":" << gemm_counter << ",\n"
-       << " \"alu_counter\":" << alu_counter << "\n"
-       <<"}\n";
-    return os.str();
-  }
-
-  static Profiler* ThreadLocal() {
-    static thread_local Profiler inst;
-    return &inst;
-  }
-};
-
-
-// Simulate device
-// TODO(tqchen,thierry): queue based event driven simulation.
-class Device {
- public:
-  Device() {
-    prof_ = Profiler::ThreadLocal();
-    dram_ = DRAM::Global();
-    ptlpp = TlppVerify::Global();
-  }
-
-  int Run(vta_phy_addr_t insn_phy_addr,
-          uint32_t insn_count,
-          uint32_t wait_cycles) {
-    VTAGenericInsn* insn = static_cast<VTAGenericInsn*>(
-        dram_->GetAddr(insn_phy_addr));
-    finish_counter_ = 0;
-    for (uint32_t i = 0; i < insn_count; ++i) {
-      this->Run(insn + i);
-    }
-    this->TlppSynchronization();
-    return 0;
-  }
-
- private:
-  static void Run_Insn(const VTAGenericInsn* insn, void * dev) {
-    Device * device = reinterpret_cast<Device *> (dev);
-    const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
-    const VTAGemInsn* gem = reinterpret_cast<const VTAGemInsn*>(insn);
-    const VTAAluInsn* alu = reinterpret_cast<const VTAAluInsn*>(insn);
-    switch (mem->opcode) {
-      case VTA_OPCODE_LOAD: device->RunLoad(mem); break;
-      case VTA_OPCODE_STORE: device->RunStore(mem); break;
-      case VTA_OPCODE_GEMM: device->RunGEMM(gem); break;
-      case VTA_OPCODE_ALU: device->RunALU(alu); break;
-      case VTA_OPCODE_FINISH: ++(device->finish_counter_); break;
-      default: {
-        LOG(FATAL) << "Unknown op_code" << mem->opcode;
-      }
-    }
-  }
-
- private:
-  void Run(const VTAGenericInsn* insn) {
-    ptlpp->TlppPushInsn(insn);
-  }
-
-  void TlppSynchronization(void) {
-    ptlpp->TlppSynchronization(Run_Insn, reinterpret_cast<void *> (this));
-  }
-
-  void RunLoad(const VTAMemInsn* op) {
-    if (op->x_size == 0) return;
-    if (op->memory_type == VTA_MEM_ID_INP) {
-      inp_.Load(op, dram_, &(prof_->inp_load_nbytes), prof_->SkipExec());
-    } else if (op->memory_type == VTA_MEM_ID_WGT) {
-      wgt_.Load(op, dram_, &(prof_->wgt_load_nbytes), prof_->SkipExec());
-    } else if (op->memory_type == VTA_MEM_ID_ACC) {
-      acc_.Load(op, dram_, &(prof_->acc_load_nbytes), prof_->SkipExec());
-    } else if (op->memory_type == VTA_MEM_ID_UOP) {
-      // always load in uop, since uop is stateful
-      // subsequent non-debug mode exec can depend on it.
-      uop_.Load(op, dram_, &(prof_->uop_load_nbytes), false);
-    } else {
-      LOG(FATAL) << "Unknown memory_type=" << op->memory_type;
-    }
-  }
-
-  void RunStore(const VTAMemInsn* op) {
-    if (op->x_size == 0) return;
-    if (op->memory_type == VTA_MEM_ID_ACC ||
-        op->memory_type == VTA_MEM_ID_UOP) {
-      prof_->out_store_nbytes += (
-          op->x_size * op->y_size * VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8);
-      if (!prof_->SkipExec()) {
-        acc_.TruncStore<VTA_OUT_WIDTH>(op, dram_);
-      }
-    } else {
-      LOG(FATAL) << "Store do not support memory_type="
-                 << op->memory_type;
-    }
-  }
-
-  void RunGEMM(const VTAGemInsn* op) {
-    if (!op->reset_reg) {
-      prof_->gemm_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn);
-      if (prof_->SkipExec()) return;
-      for (uint32_t y = 0; y < op->iter_out; ++y) {
-        for (uint32_t x = 0; x < op->iter_in; ++x) {
-          for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) {
-            VTAUop* uop_ptr = static_cast<VTAUop*>(uop_.BeginPtr(uindex));
-            // Read in memory indices
-            uint32_t acc_idx = uop_ptr->dst_idx;
-            uint32_t inp_idx = uop_ptr->src_idx;
-            uint32_t wgt_idx = uop_ptr->wgt_idx;
-
-            acc_idx += y * op->dst_factor_out + x * op->dst_factor_in;
-            inp_idx += y * op->src_factor_out + x * op->src_factor_in;
-            wgt_idx += y * op->wgt_factor_out + x * op->wgt_factor_in;
-            BitPacker<VTA_ACC_WIDTH> acc(acc_.BeginPtr(acc_idx));
-            BitPacker<VTA_INP_WIDTH> inp(inp_.BeginPtr(inp_idx));
-            BitPacker<VTA_WGT_WIDTH> wgt(wgt_.BeginPtr(wgt_idx));
-
-            // gemm loop
-            for (uint32_t i = 0; i < VTA_BATCH; ++i) {
-              for (uint32_t j = 0; j < VTA_BLOCK_OUT; ++j) {
-                uint32_t acc_offset = i * VTA_BLOCK_OUT + j;
-                int32_t sum = acc.GetSigned(acc_offset);
-                for (uint32_t k = 0; k < VTA_BLOCK_IN; ++k) {
-                  sum +=
-                      inp.GetSigned(i * VTA_BLOCK_IN + k) *
-                      wgt.GetSigned(j * VTA_BLOCK_IN + k);
-                }
-                acc.SetSigned(acc_offset, sum);
-              }
-            }
-          }
-        }
-      }
-    } else {
-      if (prof_->SkipExec()) return;
-      // reset
-      for (uint32_t y = 0; y < op->iter_out; ++y) {
-        for (uint32_t x = 0; x < op->iter_in; ++x) {
-          for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) {
-            VTAUop* uop_ptr = static_cast<VTAUop*>(uop_.BeginPtr(uindex));
-            uint32_t acc_idx = uop_ptr->dst_idx;
-            acc_idx += y * op->dst_factor_out + x * op->dst_factor_in;
-            BitPacker<VTA_ACC_WIDTH> acc(acc_.BeginPtr(acc_idx));
-            for (uint32_t i = 0; i < VTA_BATCH * VTA_BLOCK_OUT; ++i) {
-              acc.SetSigned(i, 0);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void RunALU(const VTAAluInsn* op) {
-    if (op->use_imm) {
-      RunALU_<true>(op);
-    } else {
-      RunALU_<false>(op);
-    }
-  }
-
-  template<bool use_imm>
-  void RunALU_(const VTAAluInsn* op) {
-    switch (op->alu_opcode) {
-      case VTA_ALU_OPCODE_ADD: {
-        return RunALULoop<use_imm>(op, [](int32_t x, int32_t y) {
-            return x + y;
-          });
-      }
-      case VTA_ALU_OPCODE_MAX: {
-        return RunALULoop<use_imm>(op, [](int32_t x, int32_t y) {
-            return std::max(x, y);
-          });
-      }
-      case VTA_ALU_OPCODE_MIN: {
-        return RunALULoop<use_imm>(op, [](int32_t x, int32_t y) {
-            return std::min(x, y);
-          });
-      }
-      case VTA_ALU_OPCODE_SHR: {
-        return RunALULoop<use_imm>(op, [](int32_t x, int32_t y) {
-            if (y >= 0) {
-              return x >> y;
-            } else {
-              return x << (-y);
-            }
-          });
-      }
-      default: {
-        LOG(FATAL) << "Unknown ALU code " << op->alu_opcode;
-      }
-    }
-  }
-
-  template<bool use_imm, typename F>
-  void RunALULoop(const VTAAluInsn* op, F func) {
-    prof_->alu_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn);
-    if (prof_->SkipExec()) return;
-    for (int y = 0; y < op->iter_out; ++y) {
-      for (int x = 0; x < op->iter_in; ++x) {
-        for (int k = op->uop_bgn; k < op->uop_end; ++k) {
-          // Read micro op
-          VTAUop* uop_ptr = static_cast<VTAUop*>(uop_.BeginPtr(k));
-          uint32_t dst_index = uop_ptr->dst_idx;
-          uint32_t src_index = uop_ptr->src_idx;
-          dst_index += y * op->dst_factor_out + x * op->dst_factor_in;
-          src_index += y * op->src_factor_out + x * op->src_factor_in;
-          BitPacker<VTA_ACC_WIDTH> dst(acc_.BeginPtr(dst_index));
-          BitPacker<VTA_ACC_WIDTH> src(acc_.BeginPtr(src_index));
-          for (int k = 0; k < VTA_BATCH * VTA_BLOCK_OUT; ++k) {
-            if (use_imm) {
-              dst.SetSigned(k, func(dst.GetSigned(k), op->imm));
-            } else {
-              dst.SetSigned(k, func(dst.GetSigned(k), src.GetSigned(k)));
-            }
-          }
-        }
-      }
-    }
-  }
-  // the finish counter
-  int finish_counter_{0};
-  // Prof_
-  Profiler* prof_;
-  // The DRAM interface
-  DRAM* dram_;
-  TlppVerify* ptlpp;
-  // The SRAM
-  SRAM<VTA_INP_WIDTH, VTA_BATCH * VTA_BLOCK_IN, VTA_INP_BUFF_DEPTH> inp_;
-  SRAM<VTA_WGT_WIDTH, VTA_BLOCK_IN * VTA_BLOCK_OUT, VTA_WGT_BUFF_DEPTH> wgt_;
-  SRAM<VTA_ACC_WIDTH, VTA_BATCH * VTA_BLOCK_OUT, VTA_ACC_BUFF_DEPTH> acc_;
-  SRAM<VTA_UOP_WIDTH, 1, VTA_UOP_BUFF_DEPTH> uop_;
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("vta.simulator.profiler_clear")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Profiler::ThreadLocal()->Clear();
-  });
-TVM_REGISTER_GLOBAL("vta.simulator.profiler_status")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = Profiler::ThreadLocal()->AsJSON();
-  });
-TVM_REGISTER_GLOBAL("vta.simulator.profiler_debug_mode")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Profiler::ThreadLocal()->debug_flag = args[0];
-  });
-}  // namespace sim
-}  // namespace vta
-
-void* VTAMemAlloc(size_t size, int cached) {
-  return vta::sim::DRAM::Global()->Alloc(size);
-}
-
-void VTAMemFree(void* buf) {
-  vta::sim::DRAM::Global()->Free(buf);
-}
-
-vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
-  return vta::sim::DRAM::Global()->GetPhyAddr(buf);
-}
-
-void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
-  memcpy(dst, src, size);
-}
-
-void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
-  memcpy(dst, src, size);
-}
-
-void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-}
-
-void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-}
-
-VTADeviceHandle VTADeviceAlloc() {
-  return new vta::sim::Device();
-}
-
-void VTADeviceFree(VTADeviceHandle handle) {
-  delete static_cast<vta::sim::Device*>(handle);
-}
-
-int VTADeviceRun(VTADeviceHandle handle,
-                 vta_phy_addr_t insn_phy_addr,
-                 uint32_t insn_count,
-                 uint32_t wait_cycles) {
-  return static_cast<vta::sim::Device*>(handle)->Run(
-      insn_phy_addr, insn_count, wait_cycles);
-}
-
-void VTAProgram(const char* bitstream) {
-}
diff --git a/vta/vta-hw/src/sim/sim_tlpp.cc b/vta/vta-hw/src/sim/sim_tlpp.cc
deleted file mode 100644
index d5ec7ea4ecdc..000000000000
--- a/vta/vta-hw/src/sim/sim_tlpp.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sim_tlpp.cc
- * \brief simulate core level pipe line parallism logic.
- */
-#include <vta/sim_tlpp.h>
-TlppVerify::TlppVerify() {
-  done_ = 0;
-}
-
-void TlppVerify::Clear() {
-  fsim_handle_ = nullptr;
-  run_fsim_function_ = nullptr;
-  for (int i = 0; i < COREMAX; i++) {
-    while (insnq_array_[i].size()) {
-      insnq_array_[i].pop();
-    }
-  }
-  done_ = 0;
-}
-
-uint64_t TlppVerify::GetOperationCode(const VTAGenericInsn *insn) {
-  const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
-  return mem->opcode;
-}
-
-CORE_TYPE TlppVerify::GetCoreType(uint64_t operation_code,
-                              const VTAGenericInsn *insn) {
-  CORE_TYPE core_type = COREGEMM;
-  const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
-  switch (operation_code) {
-    case VTA_OPCODE_GEMM:
-    case VTA_OPCODE_ALU:
-      core_type = COREGEMM;
-      break;
-    case VTA_OPCODE_LOAD:
-      if (mem->memory_type == VTA_MEM_ID_INP||
-          mem->memory_type == VTA_MEM_ID_WGT) {
-        core_type = CORELOAD;
-      }
-      break;
-    case VTA_OPCODE_STORE:
-      core_type = CORESTORE;
-      break;
-    default:
-      break;
-  }
-  return core_type;
-}
-
-bool TlppVerify::DependencyProcess(bool before_run,
-    bool pop_prev, bool pop_next,
-    bool push_prev, bool push_next,
-    Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
-    Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
-    CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx) {
-
-  int val = 1;
-  if (before_run) {
-    if (pop_prev && pop_prev_q->size() == 0) {
-      return false;
-    }
-    if (pop_next && pop_next_q->size() == 0) {
-      return false;
-    }
-    if (pop_next) pop_next_q->pop();
-    if (pop_prev) pop_prev_q->pop();
-  } else {
-    if (push_prev) {
-      push_prev_q->push(val);
-      dep_push_event_.push(push_to_prev_q_indx);
-    }
-    if (push_next) {
-      push_next_q->push(val);
-      dep_push_event_.push(push_to_next_q_indx);
-    }
-  }
-  return true;
-}
-
-bool TlppVerify::InsnDependencyCheck(const VTAGenericInsn *insn,
-                                     bool before_run) {
-  const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
-  bool pop_prev = mem->pop_prev_dep;
-  bool pop_next = mem->pop_next_dep;
-  bool push_prev = mem->push_prev_dep;
-  bool push_next = mem->push_next_dep;
-  CORE_TYPE core_type = GetCoreType(GetOperationCode(insn), insn);
-  bool bcheck = false;
-  switch (core_type) {
-    case COREGEMM:
-      bcheck = DependencyProcess(before_run, pop_prev,
-          pop_next, push_prev, push_next,
-          &l2g_q_, &s2g_q_, &g2l_q_, &g2s_q_, CORELOAD, CORESTORE);
-      break;
-    case CORELOAD:
-      bcheck = DependencyProcess(before_run, pop_prev,
-          pop_next, push_prev, push_next,
-          nullptr, &g2l_q_, nullptr, &l2g_q_, COREMAX, COREGEMM);
-      break;
-    case CORESTORE:
-      bcheck = DependencyProcess(before_run, pop_prev,
-          pop_next, push_prev, push_next,
-          &g2s_q_, nullptr, &s2g_q_, nullptr, COREGEMM, COREMAX);
-      break;
-    case COREMAX:
-      assert(0);
-      break;
-  }
-
-  return bcheck;
-}
-
-void TlppVerify::CoreRun(CORE_TYPE core_type) {
-  const VTAGenericInsn *insn = PickFrontInsn(core_type);
-  while (insn) {
-    /*!
-     * Check need to read any dependency queue for wait.
-     */
-    if (!InsnDependencyCheck(insn, true)) {
-      break;
-    }
-    /*!
-     * Execute the instruction.
-     */
-    run_fsim_function_(insn, fsim_handle_);
-    /*!
-     *check if need to write any dependency queue for notify.
-     */
-    InsnDependencyCheck(insn, false);
-    /*!
-     * If instruction is FINISH set done flag.
-     * notification.
-     */
-    done_ = GetOperationCode(insn) == VTA_OPCODE_FINISH;
-
-    if (debug_) {
-      printf("this is thread for %s\n", GetCoreTypeName(core_type));
-    }
-    ConsumeFrontInsn(core_type);
-    insn = PickFrontInsn(core_type);
-  }
-  return;
-}
-
-void TlppVerify::EventProcess(void) {
-  while (dep_push_event_.size()) {
-      CORE_TYPE core_type = dep_push_event_.front();
-      dep_push_event_.pop();
-      CoreRun(core_type);
-  }
-}
-
-void TlppVerify::TlppSynchronization(Run_Function run_function,
-                                         void *fsim_handle,
-                                         bool debug) {
-  fsim_handle_ = fsim_handle;
-  run_fsim_function_ = run_function;
-  debug_ = debug;
-  done_ = 0;
-  do {
-    /*
-     * Pick a random core to run first.
-     */
-    unsigned int seed = time(NULL);
-    uint8_t core_start = rand_r(&seed)%COREMAX;
-    for (int i = 0; i < COREMAX; i++) {
-      CoreRun(static_cast<CORE_TYPE>((core_start + i) % COREMAX));
-    }
-    EventProcess();
-  }while (!done_);
-  Clear();
-  return;
-}
-
-void TlppVerify::TlppPushInsn(const VTAGenericInsn *insn) {
-  uint64_t operation_code = GetOperationCode(insn);
-  CORE_TYPE core_type = GetCoreType(operation_code, insn);
-  insnq_array_[core_type].push(static_cast<const void *>(insn));
-  return;
-}
-
-const VTAGenericInsn *TlppVerify::PickFrontInsn(uint64_t core_type) {
-  const void *return_value = nullptr;
-  if (insnq_array_[core_type].size()) {
-    return_value = insnq_array_[core_type].front();
-  }
-  return reinterpret_cast<const VTAGenericInsn *> (return_value);
-}
-
-void TlppVerify::ConsumeFrontInsn(uint64_t core_type) {
-  if (insnq_array_[core_type].size()) {
-    insnq_array_[core_type].pop();
-  }
-}
diff --git a/vta/vta-hw/src/tsim/tsim_driver.cc b/vta/vta-hw/src/tsim/tsim_driver.cc
deleted file mode 100644
index 646dbe17a2e6..000000000000
--- a/vta/vta-hw/src/tsim/tsim_driver.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/registry.h>
-#include <vta/driver.h>
-#include <vta/dpi/module.h>
-
-#include "../vmem/virtual_memory.h"
-
-namespace vta {
-namespace tsim {
-
-using tvm::runtime::Module;
-using vta::dpi::DPIModuleNode;
-
-class Profiler {
- public:
-  Profiler() {
-    counters_ = new int[num_counters_];
-    this->ClearAll();
-  }
-
-  ~Profiler() {
-    delete [] counters_;
-  }
-
-  /*! \brief update one event counter */
-  void Update(uint32_t idx, uint32_t value) {
-    counters_[idx] += value;
-  }
-
-  /*! \brief clear one event counter*/
-  void Clear(uint32_t idx) {
-    counters_[idx] = 0;
-  }
-
-  /*! \brief clear all event counters */
-  void ClearAll() {
-    for (uint32_t i = 0; i < num_counters_; i++) {
-      counters_[i] = 0;
-    }
-  }
-
-  /*! \brief return counters as json */
-  std::string AsJSON() {
-    std::ostringstream os;
-    os << "{\n"
-       << " \"cycle_count\":" << counters_[0] << "\n"
-       <<"}\n";
-    return os.str();
-  }
-
-  static Profiler* Global() {
-    static Profiler inst;
-    return &inst;
-  }
-
- private:
-  /*! \brief total number of event counters */
-  uint32_t num_counters_{1};
-  /*! \brief event counters */
-  int* counters_{nullptr};
-};
-
-class DPILoader {
- public:
-  ~DPILoader() {
-    dpi_->SimResume();
-    dpi_->SimFinish();
-  }
-
-  void Init(Module module) {
-    mod_ = module;
-    dpi_ = this->Get();
-    dpi_->SimLaunch();
-    dpi_->SimWait();
-  }
-
-  DPIModuleNode* Get() {
-    return static_cast<DPIModuleNode*>(mod_.operator->());
-  }
-
-  static DPILoader* Global() {
-    static DPILoader inst;
-    return &inst;
-  }
-
-  // TVM module
-  Module mod_;
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-};
-
-class Device {
- public:
-  Device() {
-    loader_ = DPILoader::Global();
-    prof_ = Profiler::Global();
-  }
-
-  int Run(vta_phy_addr_t insn_phy_addr,
-          uint32_t insn_count,
-          uint32_t wait_cycles) {
-    this->Init();
-    this->Launch(insn_phy_addr,
-                 insn_count,
-                 wait_cycles);
-    this->WaitForCompletion(wait_cycles);
-    return 0;
-  }
-
- private:
-  void Init() {
-    dpi_ = loader_->Get();
-    dpi_->SimResume();
-  }
-
-  void Launch(vta_phy_addr_t insn_phy_addr,
-              uint32_t insn_count,
-              uint32_t wait_cycles) {
-    dpi_->WriteReg(0x08, insn_count);
-    dpi_->WriteReg(0x0c, insn_phy_addr);
-    dpi_->WriteReg(0x10, 0);
-    dpi_->WriteReg(0x14, 0);
-    dpi_->WriteReg(0x18, 0);
-    dpi_->WriteReg(0x1c, 0);
-    dpi_->WriteReg(0x20, 0);
-    // start
-    dpi_->WriteReg(0x00, 0x1);
-  }
-
-  void WaitForCompletion(uint32_t wait_cycles) {
-    uint32_t i, val;
-    for (i = 0; i < wait_cycles; i++) {
-      val = dpi_->ReadReg(0x00);
-      val &= 0x2;
-      if (val == 0x2) break;  // finish
-    }
-    prof_->Update(0, dpi_->ReadReg(0x04));
-    dpi_->SimWait();
-  }
-
-  // Profiler
-  Profiler* prof_;
-  // DPI loader
-  DPILoader* loader_;
-  // DPI Module
-  DPIModuleNode* dpi_;
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("vta.tsim.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Module m = args[0];
-    DPILoader::Global()->Init(m);
-  });
-
-TVM_REGISTER_GLOBAL("vta.tsim.profiler_clear")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Profiler::Global()->ClearAll();
-  });
-
-TVM_REGISTER_GLOBAL("vta.tsim.profiler_status")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = Profiler::Global()->AsJSON();
-  });
-
-}  // namespace tsim
-}  // namespace vta
-
-void* VTAMemAlloc(size_t size, int cached) {
-  return vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
-}
-
-void VTAMemFree(void* buf) {
-  vta::vmem::VirtualMemoryManager::Global()->Free(buf);
-}
-
-vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
-  return vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(buf);
-}
-
-void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
-  memcpy(dst, src, size);
-}
-
-void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
-  memcpy(dst, src, size);
-}
-
-void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-}
-
-void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
-}
-
-VTADeviceHandle VTADeviceAlloc() {
-  return new vta::tsim::Device();
-}
-
-void VTADeviceFree(VTADeviceHandle handle) {
-  delete static_cast<vta::tsim::Device*>(handle);
-}
-
-int VTADeviceRun(VTADeviceHandle handle,
-                 vta_phy_addr_t insn_phy_addr,
-                 uint32_t insn_count,
-                 uint32_t wait_cycles) {
-  return static_cast<vta::tsim::Device*>(handle)->Run(
-      insn_phy_addr,
-      insn_count,
-      wait_cycles);
-}
diff --git a/vta/vta-hw/src/vmem/virtual_memory.cc b/vta/vta-hw/src/vmem/virtual_memory.cc
deleted file mode 100644
index 0bf2382e155e..000000000000
--- a/vta/vta-hw/src/vmem/virtual_memory.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file virtual_memory.cc
- * \brief Thread-safe virtal memory manager
- */
-
-#include "virtual_memory.h"
-
-#include <dmlc/logging.h>
-#include <vta/driver.h>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <list>
-#include <utility>
-#include <iterator>
-#include <unordered_map>
-#include <map>
-#include <mutex>
-
-namespace vta {
-namespace vmem {
-
-/*!
- * \brief Get virtual address given physical address.
- * \param phy_addr The simulator phyiscal address.
- * \return The true virtual address;
- */
-void* VirtualMemoryManager::GetAddr(uint64_t phy_addr) {
-  CHECK_NE(phy_addr, 0)
-      << "trying to get address that is nullptr";
-  std::lock_guard<std::mutex> lock(mutex_);
-  uint64_t loc = (phy_addr >> kPageBits) - 1;
-  CHECK_LT(loc, ptable_.size())
-      << "phy_addr=" << phy_addr;
-  Page* p = ptable_[loc];
-  CHECK(p != nullptr);
-  size_t offset = (loc - p->ptable_begin) << kPageBits;
-  offset += phy_addr & (kPageSize - 1);
-  return reinterpret_cast<char*>(p->data) + offset;
-}
-
-/*!
- * \brief Get physical address
- * \param buf The virtual address.
- * \return The true physical address;
- */
-vta_phy_addr_t VirtualMemoryManager::GetPhyAddr(void* buf) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  auto it = pmap_.find(buf);
-  uint64_t offset = 0;
-  if (it == pmap_.end()) {
-    for (it = pmap_.begin(); it != pmap_.end(); it++) {
-      uint64_t bytes = it->second->num_pages << kPageBits;
-      if ((buf >= it->first) && (buf < static_cast<char*>(it->first) + bytes)) {
-        offset = static_cast<char*>(buf) - static_cast<char*>(it->first);
-        break;
-      }
-    }
-    CHECK(it != pmap_.end());
-  }
-  Page* p = it->second.get();
-  return ((p->ptable_begin + 1) << kPageBits) + offset;
-}
-
-/*!
- * \brief Allocate memory from manager
- * \param size The size of memory
- * \return The virtual address
- */
-void* VirtualMemoryManager::Alloc(size_t size) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  size_t npage = (size + kPageSize - 1) / kPageSize;
-  auto it = free_map_.lower_bound(npage);
-  if (it != free_map_.end()) {
-    Page* p = it->second;
-    free_map_.erase(it);
-    return p->data;
-  }
-  size_t start = ptable_.size();
-  std::unique_ptr<Page> p(new Page(start, npage));
-  // insert page entry
-  ptable_.resize(start + npage, p.get());
-  void* data = p->data;
-  pmap_[data] = std::move(p);
-  return data;
-}
-
-/*!
- * \brief Free the memory.
- * \param size The size of memory
- * \return The virtual address
- */
-void VirtualMemoryManager::Free(void* data) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (pmap_.size() == 0) return;
-  auto it = pmap_.find(data);
-  CHECK(it != pmap_.end());
-  Page* p = it->second.get();
-  free_map_.insert(std::make_pair(p->num_pages, p));
-}
-
-/*!
- * \brief Copy from the host memory to device memory (virtual).
- * \param dst The device memory address (virtual)
- * \param src The host memory address
- * \param size The size of memory
- */
-void VirtualMemoryManager::MemCopyFromHost(void* dst, const void * src, size_t size) {
-  void * addr = this->GetAddr(reinterpret_cast<uint64_t>(dst));
-  memcpy(addr, src, size);
-}
-
-/*!
- * \brief Copy from the device memory (virtual) to host memory.
- * \param dst The host memory address
- * \param src The device memory address (virtual)
- * \param size The size of memory
- */
-void VirtualMemoryManager::MemCopyToHost(void* dst, const void * src, size_t size) {
-  void * addr = this->GetAddr(reinterpret_cast<uint64_t>(src));
-  memcpy(dst, addr, size);
-}
-
-VirtualMemoryManager* VirtualMemoryManager::Global() {
-  static VirtualMemoryManager inst;
-  return &inst;
-}
-
-}  // namespace vmem
-}  // namespace vta
diff --git a/vta/vta-hw/src/vmem/virtual_memory.h b/vta/vta-hw/src/vmem/virtual_memory.h
deleted file mode 100644
index 5181b3d01c2d..000000000000
--- a/vta/vta-hw/src/vmem/virtual_memory.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file virtual_memory.h
- * \brief The virtual memory manager for device simulation
- */
-
-#ifndef VTA_VMEM_VIRTUAL_MEMORY_H_
-#define VTA_VMEM_VIRTUAL_MEMORY_H_
-
-#include <vta/driver.h>
-#include <cstdint>
-#include <type_traits>
-#include <mutex>
-#include <vector>
-#include <map>
-#include <unordered_map>
-#include <memory>
-
-enum VMemCopyType {
-  kVirtualMemCopyFromHost = 0,
-  kVirtualMemCopyToHost = 1
-};
-
-namespace vta {
-namespace vmem {
-
-/*!
- * \brief DRAM memory manager
- *  Implements simple paging to allow physical address translation.
- */
-class VirtualMemoryManager {
- public:
-  /*!
-   * \brief Get virtual address given physical address.
-   * \param phy_addr The simulator phyiscal address.
-   * \return The true virtual address;
-   */
-  void* GetAddr(uint64_t phy_addr);
-  /*!
-   * \brief Get physical address
-   * \param buf The virtual address.
-   * \return The true physical address;
-   */
-  vta_phy_addr_t GetPhyAddr(void* buf);
-  /*!
-   * \brief Allocate memory from manager
-   * \param size The size of memory
-   * \return The virtual address
-   */
-  void* Alloc(size_t size);
-  /*!
-   * \brief Free the memory.
-   * \param size The size of memory
-   * \return The virtual address
-   */
-  void Free(void* data);
-  /*!
-   * \brief Copy from the host memory to device memory (virtual).
-   * \param dst The device memory address (virtual)
-   * \param src The host memory address
-   * \param size The size of memory
-   */
-  void MemCopyFromHost(void* dst, const void * src, size_t size);
-  /*!
-   * \brief Copy from the device memory (virtual) to host memory.
-   * \param dst The host memory address
-   * \param src The device memory address (virtual)
-   * \param size The size of memory
-   */
-  void MemCopyToHost(void* dst, const void * src, size_t size);
-  static VirtualMemoryManager* Global();
-
- private:
-  // The bits in page table
-  static constexpr vta_phy_addr_t kPageBits = VTA_PAGE_BITS;
-  // page size, also the maximum allocable size 16 K
-  static constexpr vta_phy_addr_t kPageSize = VTA_PAGE_BYTES;
-  /*! \brief A page in the DRAM */
-  struct Page {
-    /*! \brief Data Type */
-    using DType = typename std::aligned_storage<kPageSize, 256>::type;
-    /*! \brief Start location in page table */
-    size_t ptable_begin;
-    /*! \brief The total number of pages */
-    size_t num_pages;
-    /*! \brief Data */
-    DType* data{nullptr};
-    // construct a new page
-    explicit Page(size_t ptable_begin, size_t num_pages)
-        : ptable_begin(ptable_begin), num_pages(num_pages) {
-      data = new DType[num_pages];
-    }
-    ~Page() {
-      delete [] data;
-    }
-  };
-  // Internal lock
-  std::mutex mutex_;
-  // Physical address -> page
-  std::vector<Page*> ptable_;
-  // virtual addres -> page
-  std::unordered_map<void*, std::unique_ptr<Page> > pmap_;
-  // Free map
-  std::multimap<size_t, Page*> free_map_;
-};
-
-
-}  // namespace vmem
-}  // namespace vta
-
-#endif  // VTA_VMEM_VIRTUAL_MEMORY_H_
diff --git a/vta/vta-hw/tests/hardware/common/test_lib.cc b/vta/vta-hw/tests/hardware/common/test_lib.cc
deleted file mode 100644
index ae4adf91d7db..000000000000
--- a/vta/vta-hw/tests/hardware/common/test_lib.cc
+++ /dev/null
@@ -1,1448 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file test_lib.cpp
- * \brief Test library for the VTA design simulation and driver tests.
- */
-
-#include "test_lib.h"
-
-#ifdef NO_SIM
-#ifdef VTA_TARGET_PYNQ
-
-uint64_t vta(
-  uint32_t insn_count,
-  VTAGenericInsn *insns,
-  VTAUop *uops,
-  uint32_t *inputs,
-  uint32_t *weights,
-  uint32_t *biases,
-  uint32_t *outputs) {
-  // Performance counter variables
-  uint64_t t_fpga;
-  struct timespec start, stop;
-
-  // Derive bitstream file
-  char bitstream[128];
-  char str_batch_size[4];
-  char str_block_out_size[4];
-  char str_block_in_size[4];
-  char str_block_bit_width[4];
-  snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
-  snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
-  snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
-  snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
-  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
-
-  // Get VTA handles
-  void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR);
-  void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR);
-  void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR);
-  void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR);
-
-  // Physical address pointers
-  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
-  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
-  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
-  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
-  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
-
-#if VTA_DEBUG == 1
-  printf("INFO - Starting FPGA!\n");
-#endif
-
-  clock_gettime(CLOCK_REALTIME, &start);
-
-  VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
-  if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy);
-  if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy);
-  if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy);
-  if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy);
-  if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy);
-  if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy);
-
-  // VTA start
-  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
-  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
-  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
-  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
-
-  int flag = 0, t = 0;
-  for (t = 0; t < 10000000; ++t) {
-    flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET);
-    if (flag & VTA_DONE) break;
-  }
-
-  if (t == 10000000) {
-    printf("\tWARNING: VTA TIMEOUT!!!!\n");
-#if VTA_DEBUG == 1
-  } else {
-    printf("INFO - FPGA Finished!\n");
-#endif
-  }
-
-  clock_gettime(CLOCK_REALTIME, &stop);
-  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
-
-  // Unmap VTA register
-  VTAUnmapRegister(vta_fetch_handle);
-  VTAUnmapRegister(vta_load_handle);
-  VTAUnmapRegister(vta_compute_handle);
-  VTAUnmapRegister(vta_store_handle);
-
-  return t_fpga;
-}
-
-#endif  // VTA_TARGET_PYNQ
-#endif  // NO_SIM
-
-uint32_t globalSeed;
-
-const char* getOpcodeString(int opcode, bool use_imm) {
-  // Returns string name
-  if (opcode == VTA_ALU_OPCODE_MIN) {
-    if (use_imm) {
-      return "min imm";
-    } else {
-      return "min";
-    }
-  } else if (opcode == VTA_ALU_OPCODE_MAX) {
-    if (use_imm) {
-      return "max imm";
-    } else {
-      return "max";
-    }
-  } else if (opcode == VTA_ALU_OPCODE_ADD) {
-    if (use_imm) {
-      return "add imm";
-    } else {
-      return "add";
-    }
-  } else if (opcode == VTA_ALU_OPCODE_SHR) {
-    return "shr";
-  }
-  // else if (opcode == VTA_ALU_OPCODE_MUL) {
-  //   return "mul";
-  // }
-  return "unknown op";
-}
-
-template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
-void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) {
-  assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH  == 0);
-  assert(DST_T_WIDTH <= 64);
-  int buffer_idx = 0;
-  int ratio = DST_T_WIDTH / SRC_T_WIDTH;
-  long long int mask = (1ULL << SRC_T_WIDTH) - 1;
-  DST_T tmp = 0;
-  for (int i = 0; i < y_size / y_block; i++) {
-    for (int j = 0; j < x_size / x_block; j++) {
-      for (int k = 0; k < y_block; k++) {
-        for (int l = 0; l < x_block; l++) {
-          int block_idx = l + k * x_block;
-          tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * SRC_T_WIDTH);
-          // When tmp is packed, write to destination array
-          if (block_idx % ratio == ratio - 1) {
-            dst[buffer_idx++] = tmp;
-            tmp = 0;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
-void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) {
-  assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0);
-  int buffer_idx = 0;
-  long long int mask = (1ULL << DST_T_WIDTH) - 1;
-  int ratio = SRC_T_WIDTH / DST_T_WIDTH;
-  for (int i = 0; i < y_size / y_block; i++) {
-    for (int j = 0; j < x_size / x_block; j++) {
-      for (int k = 0; k < y_block; k++) {
-        for (int l = 0; l < x_block; l++) {
-          int block_idx = l + k * x_block;
-          dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * DST_T_WIDTH)) & mask;
-          if (block_idx % ratio == ratio - 1) {
-            buffer_idx++;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-T ** allocInit2dArray(int rows, int cols) {
-  // Allocate
-  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
-  for (int i = 0; i < rows; i++) {
-    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
-  }
-  // Init
-  for (int i = 0; i < rows; i++) {
-    for (int j = 0; j < cols; j++) {
-      array[i][j] = static_cast<T>(rand_r(&globalSeed));
-    }
-  }
-  return array;
-}
-
-template <typename T>
-T ** allocSet2dArray(int rows, int cols, int val) {
-  // Allocate
-  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
-  for (int i = 0; i < rows; i++) {
-    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
-  }
-  // Init
-  for (int i = 0; i < rows; i++) {
-    for (int j = 0; j < cols; j++) {
-      array[i][j] = static_cast<T>(val);
-    }
-  }
-  return array;
-}
-
-template <typename T>
-T ** alloc2dArray(int rows, int cols) {
-  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
-  for (int i = 0; i < rows; i++) {
-    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
-  }
-  return array;
-}
-
-template <typename T>
-void free2dArray(T **array, int rows, int cols) {
-  for (int i = 0; i < rows; i++) {
-    free(array[i]);
-  }
-  free(array);
-}
-
-template <typename T>
-T *** alloc3dArray(int rows, int cols, int depth) {
-  T ***array = static_cast<T ***>(malloc(sizeof(T **) * rows));
-  for (int i = 0; i < rows; i++) {
-    array[i] = static_cast<T **>(malloc(sizeof(T *) * cols));
-    for (int j = 0; j < cols; j++) {
-      array[i][j] = static_cast<T*>(malloc(sizeof(T) * depth));
-    }
-  }
-  return array;
-}
-
-template <typename T>
-void free3dArray(T *** array, int rows, int cols, int depth) {
-  for (int i = 0; i < rows; i++) {
-    for (int j = 0; j < cols; j++) {
-      free(array[i][j]);
-    }
-    free(array[i]);
-  }
-  free(array);
-}
-
-void * allocBuffer(size_t num_bytes) {
-#ifdef NO_SIM
-  return VTAMemAlloc(num_bytes, VTA_CACHED);
-#else
-  return malloc(num_bytes);
-#endif
-}
-
-void freeBuffer(void * buffer) {
-#ifdef NO_SIM
-  return VTAMemFree(buffer);
-#else
-  return free(buffer);
-#endif
-}
-
-VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
-    int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
-    int push_prev_dep, int push_next_dep) {
-  // Converter
-  union VTAInsn converter;
-  // Memory instruction initialization
-  VTAMemInsn insn = {};
-  insn.opcode = opcode;
-  insn.pop_prev_dep = pop_prev_dep;
-  insn.pop_next_dep = pop_next_dep;
-  insn.push_prev_dep = push_prev_dep;
-  insn.push_next_dep = push_next_dep;
-  insn.memory_type = type;
-  insn.sram_base = sram_offset;
-  insn.dram_base = dram_offset;
-  insn.y_size = y_size;
-  insn.x_size = x_size;
-  insn.x_stride = x_stride;
-  insn.y_pad_0 = y_pad;
-  insn.y_pad_1 = y_pad;
-  insn.x_pad_0 = x_pad;
-  insn.x_pad_1 = x_pad;
-  converter.mem = insn;
-  return converter.generic;
-}
-
-VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
-    int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
-  // Converter
-  union VTAInsn converter;
-  // Memory instruction initialization
-  VTAMemInsn insn = {};
-  insn.opcode = opcode;
-  insn.pop_prev_dep = pop_prev_dep;
-  insn.pop_next_dep = pop_next_dep;
-  insn.push_prev_dep = push_prev_dep;
-  insn.push_next_dep = push_next_dep;
-  insn.memory_type = type;
-  insn.sram_base = sram_offset;
-  insn.dram_base = dram_offset;
-  insn.y_size = 1;
-  insn.x_size = size;
-  insn.x_stride = size;
-  insn.y_pad_0 = 0;
-  insn.y_pad_1 = 0;
-  insn.x_pad_0 = 0;
-  insn.x_pad_1 = 0;
-  converter.mem = insn;
-  return converter.generic;
-}
-
-VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
-    bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
-    int push_next_dep) {
-  // Converter
-  union VTAInsn converter;
-  // GEMM instruction initialization
-  VTAGemInsn insn;
-  insn.opcode = VTA_OPCODE_GEMM;
-  insn.pop_prev_dep = pop_prev_dep;
-  insn.pop_next_dep = pop_next_dep;
-  insn.push_prev_dep = push_prev_dep;
-  insn.push_next_dep = push_next_dep;
-  insn.reset_reg = false;
-  if (!uop_compression) {
-    insn.uop_bgn = uop_offset;
-    insn.uop_end = uop_offset + batch * in_feat * out_feat;
-    insn.iter_out = 1;
-    insn.iter_in = 1;
-    insn.dst_factor_out = 0;
-    insn.src_factor_out = 0;
-    insn.wgt_factor_out = 0;
-    insn.dst_factor_in = 0;
-    insn.src_factor_in = 0;
-    insn.wgt_factor_in = 0;
-  } else {
-    insn.uop_bgn = uop_offset;
-    insn.uop_end = uop_offset + batch;
-    insn.iter_out = in_feat;
-    insn.iter_in = out_feat;
-    insn.dst_factor_out = 0;
-    insn.src_factor_out = 1;
-    insn.wgt_factor_out = 1;
-    insn.dst_factor_in = 1;
-    insn.src_factor_in = 0;
-    insn.wgt_factor_in = in_feat;
-  }
-  converter.gemm = insn;
-  return converter.generic;
-}
-
-VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bool uop_compression,
-    int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
-  // Converter
-  union VTAInsn converter;
-  // Memory instruction initialization
-  VTAAluInsn insn = {};
-  insn.opcode = VTA_OPCODE_ALU;
-  insn.pop_prev_dep = pop_prev_dep;
-  insn.pop_next_dep = pop_next_dep;
-  insn.push_prev_dep = push_prev_dep;
-  insn.push_next_dep = push_next_dep;
-  insn.reset_reg = false;
-  if (!uop_compression) {
-    insn.uop_bgn = 0;
-    insn.uop_end = vector_size;
-    insn.iter_out = 1;
-    insn.iter_in = 1;
-    insn.dst_factor_out = 0;
-    insn.src_factor_out = 0;
-    insn.dst_factor_in = 0;
-    insn.src_factor_in = 0;
-    insn.alu_opcode = opcode;
-    insn.use_imm = use_imm;
-    insn.imm = imm;
-  } else {
-    insn.uop_bgn = 0;
-    insn.uop_end = 1;
-    insn.iter_out = 1;
-    insn.iter_in = vector_size;
-    insn.dst_factor_out = 0;
-    insn.src_factor_out = 0;
-    insn.dst_factor_in = 1;
-    insn.src_factor_in = 1;
-    insn.alu_opcode = opcode;
-    insn.use_imm = use_imm;
-    insn.imm = imm;
-  }
-  converter.alu = insn;
-  return converter.generic;
-}
-
-VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
-  // Converter
-  union VTAInsn converter;
-  // GEMM instruction initialization
-  VTAGemInsn insn;
-  insn.opcode = VTA_OPCODE_FINISH;
-  insn.pop_prev_dep = pop_prev;
-  insn.pop_next_dep = pop_next;
-  insn.push_prev_dep = 0;
-  insn.push_next_dep = 0;
-  insn.reset_reg = false;
-  insn.uop_bgn = 0;
-  insn.uop_end = 0;
-  insn.iter_out = 0;
-  insn.iter_in = 0;
-  insn.dst_factor_out = 0;
-  insn.src_factor_out = 0;
-  insn.wgt_factor_out = 0;
-  insn.dst_factor_in = 0;
-  insn.src_factor_in = 0;
-  insn.wgt_factor_in = 0;
-  converter.gemm = insn;
-  return converter.generic;
-}
-
-VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
-  // Derive the total uop size
-  int uop_size = (uop_compression) ? 1 : y_size * x_size;
-
-  // Allocate buffer
-#ifdef NO_SIM
-  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
-#else
-  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
-#endif
-
-  if (!uop_compression) {
-    int uop_idx = 0;
-    for (int i = 0; i < y_size; i++) {
-      for (int j = 0; j < x_size; j++) {
-        uop_buf[uop_idx].dst_idx = i * x_size + j;
-        uop_buf[uop_idx].src_idx = 0;
-        uop_buf[uop_idx].wgt_idx = 0;
-        uop_idx++;
-      }
-    }
-  } else {
-    uop_buf[0].dst_idx = 1;
-    uop_buf[0].src_idx = 0;
-    uop_buf[0].wgt_idx = 0;
-  }
-
-  return uop_buf;
-}
-
-VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
-    bool multi_threaded) {
-  // Derive the total uop size
-  int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat;
-  if (multi_threaded) uop_size *= 2;
-
-  // Allocate buffer
-#ifdef NO_SIM
-  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
-#else
-  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
-#endif
-
-  if (!uop_compression) {
-    int uop_idx = 0;
-    for (int i = 0; i < batch; i++) {
-      for (int j = 0; j < in_feat; j++) {
-        for (int k = 0; k < out_feat; k++) {
-          uop_buf[uop_idx].dst_idx = i * out_feat + k;
-          uop_buf[uop_idx].src_idx = i * in_feat + j;
-          uop_buf[uop_idx].wgt_idx = k * in_feat + j;
-          uop_idx++;
-        }
-      }
-    }
-  } else {
-    for (int i = 0; i < batch; i++) {
-      uop_buf[i].dst_idx = i * out_feat;
-      uop_buf[i].src_idx = i * in_feat;
-      uop_buf[i].wgt_idx = 0;
-    }
-  }
-
-  if (multi_threaded) {
-    if (!uop_compression) {
-      int uop_idx = uop_size / 2;
-      for (int i = 0; i < batch; i++) {
-        for (int j = 0; j < in_feat; j++) {
-          for (int k = 0; k < out_feat; k++) {
-            uop_buf[uop_idx].dst_idx = i * out_feat + k;
-            uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j;
-            uop_buf[uop_idx].wgt_idx = out_feat * in_feat + k * in_feat + j;
-            uop_idx++;
-          }
-        }
-      }
-    } else {
-      for (int i = 0; i < batch; i++) {
-        uop_buf[batch+i].dst_idx = i * out_feat;
-        uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat;
-        uop_buf[batch+i].wgt_idx = out_feat * in_feat;
-      }
-    }
-  }
-
-  return uop_buf;
-}
-
-VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
-  // Derive the total uop size
-  int uop_size = (uop_compression) ? 1 : vector_size;
-
-  // Allocate buffer
-#ifdef NO_SIM
-  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
-#else
-  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
-#endif
-
-  if (!uop_compression) {
-    for (int i = 0; i < vector_size; i++) {
-      uop_buf[i].dst_idx = i;
-      uop_buf[i].src_idx = vector_size + i;
-    }
-  } else {
-    uop_buf[0].dst_idx = 0;
-    uop_buf[0].src_idx = vector_size;
-  }
-
-  return uop_buf;
-}
-
-void printParameters() {
-  // Some debugging code
-  printf("Size of VTAInsn: %d\n", sizeof(VTAGenericInsn));
-  printf("Size of VTAUop: %d\n", sizeof(VTAUop));
-  printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH);
-  printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH);
-  printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH);
-  printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH);
-  printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH);
-  printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH);
-  printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH);
-  printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH);
-  printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT);
-  printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN);
-  printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT);
-  printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES);
-  printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES);
-  printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES);
-  printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES);
-  printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
-  printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
-  printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
-}
-
-void printInstruction(int num_insn, VTAGenericInsn *insns) {
-  // Keep tabs on dependence queues
-  int l2g_queue = 0;
-  int g2l_queue = 0;
-  int s2g_queue = 0;
-  int g2s_queue = 0;
-  // Converter
-  union VTAInsn c;
-  // Iterate over all instructions
-  printf("DEBUG - There are %u instructions\n", num_insn);
-  for (int i = 0; i < num_insn; i++) {
-    // Fetch instruction and decode opcode
-    c.generic = insns[i];
-    printf("DEBUG - INSTRUCTION %u: ", i);
-    if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
-      // Print instruction field information
-      if (c.mem.opcode == VTA_OPCODE_LOAD) {
-        printf("LOAD ");
-        if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
-        if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
-        if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
-        if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
-      }
-      if (c.mem.opcode == VTA_OPCODE_STORE) {
-        printf("STORE ACC\n");
-      }
-      printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-             static_cast<int>(c.mem.pop_prev_dep),
-             static_cast<int>(c.mem.pop_next_dep),
-             static_cast<int>(c.mem.push_prev_dep),
-             static_cast<int>(c.mem.push_next_dep));
-      printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
-             static_cast<int>(c.mem.dram_base),
-             static_cast<int>(c.mem.sram_base));
-      printf("\ty: size=%d, pad=[%d, %d]\n",
-             static_cast<int>(c.mem.y_size),
-             static_cast<int>(c.mem.y_pad_0),
-             static_cast<int>(c.mem.y_pad_1));
-      printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
-             static_cast<int>(c.mem.x_size),
-             static_cast<int>(c.mem.x_stride),
-             static_cast<int>(c.mem.x_pad_0),
-             static_cast<int>(c.mem.x_pad_1));
-      if (c.mem.opcode == VTA_OPCODE_STORE) {
-        if (c.mem.pop_prev_dep) g2s_queue--;
-        if (c.mem.push_prev_dep) s2g_queue++;
-      } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
-        (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) {
-        if (c.mem.pop_next_dep) g2l_queue--;
-        if (c.mem.push_next_dep) l2g_queue++;
-      } else {
-        if (c.mem.pop_prev_dep) l2g_queue--;
-        if (c.mem.push_prev_dep) g2l_queue++;
-        if (c.mem.pop_next_dep) s2g_queue--;
-        if (c.mem.push_next_dep) g2s_queue++;
-      }
-    } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
-      // Print instruction field information
-      printf("GEMM\n");
-      printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-             static_cast<int>(c.mem.pop_prev_dep),
-             static_cast<int>(c.mem.pop_next_dep),
-             static_cast<int>(c.mem.push_prev_dep),
-             static_cast<int>(c.mem.push_next_dep));
-      printf("\trange (%d, %d)\n",
-             static_cast<int>(c.gemm.uop_bgn),
-             static_cast<int>(c.gemm.uop_end));
-      printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
-      printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
-             static_cast<int>(c.gemm.iter_out),
-             static_cast<int>(c.gemm.dst_factor_out),
-             static_cast<int>(c.gemm.src_factor_out),
-             static_cast<int>(c.gemm.wgt_factor_out));
-      printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
-             static_cast<int>(c.gemm.iter_in),
-             static_cast<int>(c.gemm.dst_factor_in),
-             static_cast<int>(c.gemm.src_factor_in),
-             static_cast<int>(c.gemm.wgt_factor_in));
-      if (c.gemm.pop_prev_dep) l2g_queue--;
-      if (c.gemm.push_prev_dep) g2l_queue++;
-      if (c.gemm.pop_next_dep) s2g_queue--;
-      if (c.gemm.push_next_dep) g2s_queue++;
-    } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
-      printf("FINISH\n");
-      printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-             static_cast<int>(c.mem.pop_prev_dep),
-             static_cast<int>(c.mem.pop_next_dep),
-             static_cast<int>(c.mem.push_prev_dep),
-             static_cast<int>(c.mem.push_next_dep));
-      if (c.gemm.pop_prev_dep) l2g_queue--;
-      if (c.gemm.push_prev_dep) g2l_queue++;
-      if (c.gemm.pop_next_dep) s2g_queue--;
-      if (c.gemm.push_next_dep) g2s_queue++;
-    } else if (c.mem.opcode == VTA_OPCODE_ALU) {
-      // Print instruction field information
-      printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
-      printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-             static_cast<int>(c.mem.pop_prev_dep),
-             static_cast<int>(c.mem.pop_next_dep),
-             static_cast<int>(c.mem.push_prev_dep),
-             static_cast<int>(c.mem.push_next_dep));
-      printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
-      printf("\trange (%d, %d)\n",
-             static_cast<int>(c.alu.uop_bgn),
-             static_cast<int>(c.alu.uop_end));
-      printf("\touter loop - iter: %d, dst: %d, src: %d\n",
-             static_cast<int>(c.alu.iter_out),
-             static_cast<int>(c.alu.dst_factor_out),
-             static_cast<int>(c.alu.src_factor_out));
-      printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
-             static_cast<int>(c.alu.iter_in),
-             static_cast<int>(c.alu.dst_factor_in),
-             static_cast<int>(c.alu.src_factor_in));
-      if (c.alu.pop_prev_dep) l2g_queue--;
-      if (c.alu.push_prev_dep) g2l_queue++;
-      if (c.alu.pop_next_dep) s2g_queue--;
-      if (c.alu.push_next_dep) g2s_queue++;
-    }
-  }
-  printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
-  printf("DEBUG - s2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
-}
-
-// Helper function: Print micro-ops status
-void printMicroOp(int num_uop, VTAUop *uops) {
-  // Iterate over all micro ops
-  printf("DEBUG - There are %u micro-ops\n", num_uop);
-  for (int i = 0; i < num_uop; i++) {
-    // Read micro-op
-    printf("DEBUG - UOP %u: ", i);
-    printf("acc=%u, inp= %u, wgt=%u\n", uops[i].dst_idx, uops[i].src_idx, uops[i].wgt_idx);
-  }
-}
-
-int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
-  // Some assertions
-  assert(batch % VTA_BATCH == 0);
-  assert(vector_size % VTA_BLOCK_OUT == 0);
-  printf("=====================================================================================\n");
-  printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
-    getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
-
-  // Instruction count
-  int ins_size = 3 * batch / VTA_BATCH + 2;
-  // Micro op count
-  int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
-  // Input/output elements in each transfer
-  int tx_size = vector_size / VTA_BLOCK_OUT;
-  // Number of input sets to be generated
-  int input_sets = (use_imm) ? 1 : 2;
-  // Make sure we don't exceed buffer bounds
-  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
-  assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
-
-  // Immediate values
-  acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
-  for (int b = 0; b < batch / VTA_BATCH; b++) {
-    if (opcode == VTA_ALU_OPCODE_MIN) {
-      immediate[b] = static_cast<acc_T>(
-          rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
-    } else if (opcode == VTA_ALU_OPCODE_MAX) {
-      immediate[b] = static_cast<acc_T>(
-          rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
-    } else if (opcode == VTA_ALU_OPCODE_ADD) {
-      immediate[b] = static_cast<acc_T>(
-          rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
-    } else if (opcode == VTA_ALU_OPCODE_SHR) {
-      immediate[b] = static_cast<acc_T>(
-          rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
-    }
-    // else if (opcode == VTA_ALU_OPCODE_MUL) {
-    //   immediate[b] = static_cast<acc_T>(
-    //       rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
-    // }
-  }
-
-  // Initialize instructions
-  VTAGenericInsn *insn_buf =
-      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
-  int insn_idx = 0;
-  insn_buf[insn_idx++] =
-      get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
-  for (int b = 0; b < batch; b += VTA_BATCH) {
-    insn_buf[insn_idx++] = get2DLoadStoreInsn(
-        VTA_OPCODE_LOAD,                   // opcode
-        VTA_MEM_ID_ACC,                    // vector size
-        0,                                 // sram offset
-        b / VTA_BATCH * tx_size * input_sets,  // dram offset
-        1,                                 // y size
-        tx_size * input_sets,              // x size
-        tx_size * input_sets,              // x stride
-        0,                                 // y pad
-        0,                                 // x pad
-        0,                                 // pop prev dep
-        b > 0,                             // pop next dep
-        0,                                 // push prev dep
-        0);                                // push next dep
-    insn_buf[insn_idx++] = getALUInsn(
-        opcode,                            // opcode
-        tx_size,                           // vector size
-        use_imm,                           // use imm
-        immediate[b / VTA_BATCH],          // imm
-        uop_compression,                   // uop compression
-        0,                                 // pop prev dep
-        0,                                 // pop next dep
-        0,                                 // push prev dep
-        1);                                // push next dep
-    insn_buf[insn_idx++] = get2DLoadStoreInsn(
-        VTA_OPCODE_STORE,                  // opcode
-        VTA_MEM_ID_OUT,                    // vector size
-        0,                                 // sram offset
-        b / VTA_BATCH * tx_size,           // dram offset
-        1,                                 // y size
-        tx_size,                           // x size
-        tx_size,                           // x stride
-        0,                                 // y pad
-        0,                                 // x pad
-        1,                                 // pop prev dep
-        0,                                 // pop next dep
-        1,                                 // push prev dep
-        0);                                // push next dep
-  }
-  // Finish
-  insn_buf[insn_idx++] = getFinishInsn(0, 1);
-  // Prepare the uop buffer
-  VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
-
-#if VTA_DEBUG == 1
-  printInstruction(ins_size, insn_buf);
-  printMicroOp(uop_size, uop_buf);
-#endif
-
-  // Initialize the input/output data
-  acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < vector_size * input_sets; j++) {
-      if (opcode == VTA_ALU_OPCODE_MIN) {
-        inputs[i][j] = static_cast<acc_T>(
-            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
-      } else if (opcode == VTA_ALU_OPCODE_MAX) {
-        inputs[i][j] = static_cast<acc_T>(
-            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
-      } else if (opcode == VTA_ALU_OPCODE_ADD) {
-        inputs[i][j] = static_cast<acc_T>(
-            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
-      } else if (opcode == VTA_ALU_OPCODE_SHR) {
-        inputs[i][j] = static_cast<acc_T>(
-            rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
-      }
-    }
-  }
-
-  // Compute reference output
-  out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < vector_size; j++) {
-      acc_T out_val = 0;
-      acc_T imm_val = immediate[i / VTA_BATCH];
-      acc_T src_val = inputs[i][j + vector_size];
-      if (opcode == VTA_ALU_OPCODE_MIN) {
-        if (!use_imm) {
-          out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
-        } else {
-          out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
-        }
-      } else if (opcode == VTA_ALU_OPCODE_MAX) {
-        if (!use_imm) {
-          out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
-        } else {
-          out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
-        }
-      } else if (opcode == VTA_ALU_OPCODE_ADD) {
-        if (!use_imm) {
-          out_val = inputs[i][j] + src_val;
-        } else {
-          out_val = inputs[i][j] + imm_val;
-        }
-      } else if (opcode == VTA_ALU_OPCODE_SHR) {
-        if (!use_imm) {
-          if (src_val >= 0) {
-            out_val = inputs[i][j] >> src_val;
-          } else {
-            out_val = inputs[i][j] << (0 - src_val);
-          }
-        } else {
-          if (imm_val >= 0) {
-            out_val = inputs[i][j] >> imm_val;
-          } else {
-            out_val = inputs[i][j] << (0 - imm_val);
-          }
-        }
-      }
-      outputs_ref[i][j] = (out_T) out_val;
-    }
-  }
-
-  // Pack input buffer
-  uint32_t *bias_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
-  packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(
-      bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
-
-  // Prepare output buffer
-  uint32_t *output_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
-
-#ifdef NO_SIM
-  // Invoke the VTA
-  uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
-  // Report on timining
-  printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
-  printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
-#else
-  // Invoke the VTA
-  vta(ins_size,
-      (volatile insn_T *) insn_buf,
-      (volatile uop_T *) uop_buf,
-      (volatile bus_T *) NULL,
-      (volatile bus_T *) NULL,
-      (volatile bus_T *) bias_buf,
-      (volatile bus_T *) output_buf);
-#endif
-
-  // Unpack output buffer
-  out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
-  unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
-                                                   output_buf,
-                                                   batch,
-                                                   vector_size,
-                                                   VTA_BATCH,
-                                                   VTA_BLOCK_OUT);
-
-  // Correctness checks
-  int err = 0;
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < vector_size; j++) {
-      if (outputs_ref[i][j] != outputs[i][j]) {
-        err++;
-#if VTA_DEBUG == 1
-        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
-               static_cast<int>(outputs_ref[i][j]),
-               static_cast<int>(outputs[i][j]));
-#endif
-      }
-    }
-  }
-
-  // Free all allocated arrays
-  free(immediate);
-  free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
-  free2dArray<out_T>(outputs_ref, batch, vector_size);
-  free2dArray<out_T>(outputs, batch, vector_size);
-  freeBuffer(insn_buf);
-  freeBuffer(uop_buf);
-  freeBuffer(bias_buf);
-  freeBuffer(output_buf);
-
-  if (err == 0) {
-    printf("INFO - ALU test successful!\n");
-    return 0;
-  } else {
-    printf("INFO - ALU test failed, got %d errors!\n", err);
-    return -1;
-  }
-}
-
-int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
-    int virtual_threads) {
-  // Some assertions
-  assert(block % VTA_BLOCK_IN == 0);
-  assert(block % VTA_BLOCK_OUT == 0);
-  assert(block % VTA_BATCH == 0);
-  assert(channels % block == 0);
-  assert(batch % block == 0);
-
-  printf("=====================================================================================\n");
-  printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n",
-         batch, channels, block, uop_compression, virtual_threads);
-
-  // Input/output channels
-  int in_feat = channels;
-  int out_feat = channels;
-  // Derive number of elements that need to be loaded/stored
-  int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
-  int uop_size = uop_compression ?
-      block / VTA_BATCH * virtual_threads :
-      block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads;
-  int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN;
-  int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT;
-  int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT;
-  // Blocked buffer sizes (in terms of elements)
-  int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN;
-  int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT;
-  int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT;
-  // Make sure we don't exceed buffer bounds
-  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
-  assert(inp_block_size <= VTA_INP_BUFF_DEPTH);
-  assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH);
-  assert(out_block_size <= VTA_ACC_BUFF_DEPTH);
-
-  // Initialize instruction buffer
-  VTAGenericInsn *insn_buf =
-      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
-  int insn_idx = 0;
-
-  // Load uops
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD,
-                                            VTA_MEM_ID_UOP,
-                                            0,
-                                            0,
-                                            uop_size,
-                                            0,
-                                            0,
-                                            0,
-                                            0);
-  // Iterate over batch blocks
-  for (int i = 0; i < batch; i += block) {
-    // Iterate over output channel blocks
-    for (int j = 0; j < out_feat; j += block) {
-      // Load bias block (pop next if not first, push prev)
-      insn_buf[insn_idx++] = get2DLoadStoreInsn(
-          VTA_OPCODE_LOAD,                                    // opcode
-          VTA_MEM_ID_ACC,                                     // type
-          0,                                                  // sram offset
-          (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
-          block / VTA_BATCH,                                  // y size
-          block / VTA_BLOCK_OUT,                              // x size
-          out_feat / VTA_BLOCK_OUT,                           // x stride
-          0,                                                  // y pad
-          0,                                                  // x pad
-          0,                                                  // pop prev dep
-          (i > 0 || j > 0),                                   // pop next dep
-          (virtual_threads == 1),                             // push prev dep
-          0);                                                 // push next dep
-      // Iterate over input channel blocks
-      for (int k = 0; k < in_feat; k += block * virtual_threads) {
-        for (int l = 0; l < block * virtual_threads; l += block) {
-          // Derive dependence flags
-          bool pop = (virtual_threads == 1) ?
-              1 :
-              (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block);
-          bool push_prev = (virtual_threads == 1) ?
-              ((k + l) != in_feat - block) :
-              ((k + l) != in_feat - virtual_threads * block) &&
-              (
-                  (k + l != in_feat - block) ||
-                  (j != out_feat - block) ||
-                  (i != batch - block));
-          bool push_next = (k + l == in_feat - block);
-          // Load weight block (pop next)
-          insn_buf[insn_idx++] = get2DLoadStoreInsn(
-              VTA_OPCODE_LOAD,                                // opcode
-              VTA_MEM_ID_WGT,                                 // type
-              l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT,       // sram offset
-              (j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
-              block / VTA_BLOCK_OUT,                          // y size
-              block / VTA_BLOCK_IN,                           // x size
-              in_feat / VTA_BLOCK_IN,                         // x stride
-              0,                                              // y pad
-              0,                                              // x pad
-              0,                                              // pop prev dep
-              pop,                                            // pop next dep
-              0,                                              // push prev dep
-              0);                                             // push next dep
-          // Load input block (push next)
-          insn_buf[insn_idx++] = get2DLoadStoreInsn(
-              VTA_OPCODE_LOAD,                                // opcode
-              VTA_MEM_ID_INP,                                 // type
-              l / VTA_BLOCK_IN * block / VTA_BATCH,           // sram offset
-              (i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
-              block / VTA_BATCH,                              // y size
-              block / VTA_BLOCK_IN,                           // x size
-              in_feat / VTA_BLOCK_IN,                         // x stride
-              0,                                              // y pad
-              0,                                              // x pad
-              0,                                              // pop prev dep
-              0,                                              // pop next dep
-              0,                                              // push prev dep
-              1);                                             // push next dep
-          // Perform GEMM (pop prev, push prev if not last, push next if last)
-          insn_buf[insn_idx++] = getGEMMInsn(
-              l / block * uop_size / virtual_threads,         // uop offset
-              block / VTA_BATCH,                              // batch
-              block / VTA_BLOCK_IN,                           // in_feat
-              block / VTA_BLOCK_OUT,                          // out_feat
-              uop_compression,                                // uop_compression
-              1,                                              // pop_prev_dep
-              0,                                              // pop_next_dep
-              push_prev,                                      // push prev dep
-              push_next);                                     // push_next_dep
-        }
-      }
-      // Store output block (pop prev, push prev if not last)
-      insn_buf[insn_idx++] = get2DLoadStoreInsn(
-          VTA_OPCODE_STORE,                                   // opcode
-          VTA_MEM_ID_OUT,                                     // type
-          0,                                                  // sram offset
-          (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
-          block / VTA_BATCH,                                  // y size
-          block / VTA_BLOCK_OUT,                              // x size
-          out_feat / VTA_BLOCK_OUT,                           // x stride
-          0,                                                  // y pad
-          0,                                                  // x pad
-          1,                                                  // pop prev dep
-          0,                                                  // pop next dep
-          1,                                                  // pop prev dep
-          0);                                                 // push next dep
-    }
-  }
-  // Finish
-  insn_buf[insn_idx++] = getFinishInsn(0, 1);
-
-  // Prepare the uop buffer
-  VTAUop * uop_buf = getGEMMUops(
-      block / VTA_BATCH,
-      block / VTA_BLOCK_IN,
-      block / VTA_BLOCK_OUT,
-      uop_compression,
-      virtual_threads > 1);
-
-#if VTA_DEBUG == 1
-  printInstruction(ins_size, insn_buf);
-  printMicroOp(uop_size, uop_buf);
-#endif
-
-  // Initialize inputs
-  inp_T **inputs = allocInit2dArray<inp_T>(batch, in_feat);
-  // Initialize weights
-  wgt_T **weights = allocInit2dArray<wgt_T>(out_feat, in_feat);
-  // Initialize biases
-  acc_T **biases = allocInit2dArray<acc_T>(batch, out_feat);
-
-  // Reference GEMM implementation
-  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < out_feat; j++) {
-      acc_T sum = biases[i][j];
-      for (int k = 0; k < in_feat; k++) {
-        sum += (acc_T) (inputs[i][k] * weights[j][k]);
-      }
-      // Set
-      outputs_ref[i][j] = (out_T) sum;
-    }
-  }
-
-  // Prepare the input buffer
-  uint32_t *input_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
-  packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
-                                                 inputs,
-                                                 batch,
-                                                 in_feat,
-                                                 VTA_BATCH,
-                                                 VTA_BLOCK_IN);
-  // Prepare the weight buffer
-  uint32_t *weight_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
-  packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
-                                                 weights,
-                                                 out_feat,
-                                                 in_feat,
-                                                 VTA_BLOCK_OUT,
-                                                 VTA_BLOCK_IN);
-  // Prepare the bias buffer
-  uint32_t *bias_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
-  packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
-                                                 biases,
-                                                 batch,
-                                                 out_feat,
-                                                 VTA_BATCH,
-                                                 VTA_BLOCK_OUT);
-  // Prepare the output buffer
-  uint32_t *output_buf = static_cast<uint32_t *>(
-      allocBuffer(VTA_INP_ELEM_BYTES * out_size));
-
-#ifdef NO_SIM
-  // Invoke the VTA
-  uint64_t t_fpga = vta(ins_size,
-                        insn_buf,
-                        uop_buf,
-                        input_buf,
-                        weight_buf,
-                        bias_buf,
-                        output_buf);
-  // Report on timining
-  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
-  printf("INFO - Throughput: %.3lfGOPs/s\n",
-         static_cast<float>(batch) * in_feat * out_feat * 2 / t_fpga);
-#else
-  // Invoke the VTA
-  vta(ins_size,
-      (volatile insn_T *) insn_buf,
-      (volatile uop_T *) uop_buf,
-      (volatile bus_T *) input_buf,
-      (volatile bus_T *) weight_buf,
-      (volatile bus_T *) bias_buf,
-      (volatile bus_T *) output_buf);
-#endif
-
-  // Unpack output data
-  out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
-  unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
-                                                   output_buf,
-                                                   batch,
-                                                   out_feat,
-                                                   VTA_BATCH,
-                                                   VTA_BLOCK_OUT);
-
-  // Correctness checks
-  int err = 0;
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < out_feat; j++) {
-      if (outputs_ref[i][j] != outputs[i][j]) {
-        err++;
-#if VTA_DEBUG == 1
-        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
-               static_cast<int>(outputs_ref[i][j]),
-               static_cast<int>(outputs[i][j]));
-#endif
-      }
-    }
-  }
-
-  // Free all allocated arrays
-  free2dArray<inp_T>(inputs, batch, in_feat);
-  free2dArray<wgt_T>(weights, out_feat, in_feat);
-  free2dArray<acc_T>(biases, batch, out_feat);
-  free2dArray<out_T>(outputs_ref, batch, out_feat);
-  free2dArray<out_T>(outputs, batch, out_feat);
-  freeBuffer(insn_buf);
-  freeBuffer(uop_buf);
-  freeBuffer(input_buf);
-  freeBuffer(weight_buf);
-  freeBuffer(bias_buf);
-  freeBuffer(output_buf);
-
-  if (err == 0) {
-    printf("INFO - Blocked GEMM test successful!\n");
-    return 0;
-  } else {
-    printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
-    return -1;
-  }
-}
-
-
-int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
-  // Some assertions
-  assert(batch % VTA_BATCH == 0);
-  assert(in_channels % VTA_BLOCK_IN == 0);
-  assert(out_channels % VTA_BLOCK_OUT == 0);
-
-  printf("=====================================================================================\n");
-  printf("INFO - Blocked GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
-         batch, in_channels, out_channels, uop_compression);
-
-  // Derive number of elements that need to be loaded/stored
-  int ins_size = 7;
-  int uop_size = uop_compression ?
-      batch / VTA_BATCH :
-      batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
-  int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
-  int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
-  int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;
-  // Make sure we don't exceed buffer bounds
-  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
-  assert(inp_size <= VTA_INP_BUFF_DEPTH);
-  assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
-  assert(out_size <= VTA_ACC_BUFF_DEPTH);
-
-  // Initialize instruction buffer
-  VTAGenericInsn *insn_buf =
-      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
-  int insn_idx = 0;
-
-  // Load uops
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(
-      VTA_OPCODE_LOAD,
-      VTA_MEM_ID_UOP,
-      0,
-      0,
-      uop_size,
-      0,
-      0,
-      0,
-      0);
-  // Load bias
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(
-      VTA_OPCODE_LOAD,                                    // opcode
-      VTA_MEM_ID_ACC,                                     // type
-      0,                                                  // sram offset
-      0,                                                  // dram offset
-      out_size,                                           // size
-      0,                                                  // pop prev dep
-      0,                                                  // pop next dep
-      1,                                                  // push prev dep
-      0);                                                 // push next dep
-  // Load weight block (pop next)
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(
-      VTA_OPCODE_LOAD,                                    // opcode
-      VTA_MEM_ID_WGT,                                     // type
-      0,                                                  // sram offset
-      0,                                                  // dram offset
-      wgt_size,                                           // size
-      0,                                                  // pop prev dep
-      1,                                                  // pop next dep
-      0,                                                  // push prev dep
-      0);                                                 // push next dep
-  // Load input block (push next)
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(
-      VTA_OPCODE_LOAD,                                    // opcode
-      VTA_MEM_ID_INP,                                     // type
-      0,                                                  // sram offset
-      0,                                                  // dram offset
-      inp_size,                                           // size
-      0,                                                  // pop prev dep
-      0,                                                  // pop next dep
-      0,                                                  // push prev dep
-      1);                                                 // push next dep
-  // Perform GEMM (pop prev, push prev if not last, push next if last)
-  insn_buf[insn_idx++] = getGEMMInsn(
-      0,                                                  // uop offset
-      batch / VTA_BATCH,                                  // batch
-      in_channels / VTA_BLOCK_IN,                         // in_channels
-      out_channels / VTA_BLOCK_OUT,                       // out_channels
-      uop_compression,                                    // uop_compression
-      1,                                                  // pop_prev_dep
-      0,                                                  // pop_next_dep
-      0,                                                  // push prev dep
-      1);                                                 // push_next_dep
-  // Store output block (pop prev, push prev if not last)
-  insn_buf[insn_idx++] = get1DLoadStoreInsn(
-      VTA_OPCODE_STORE,                                   // opcode
-      VTA_MEM_ID_OUT,                                     // type
-      0,                                                  // sram offset
-      0,                                                  // dram offset
-      out_size,                                           // size
-      1,                                                  // pop prev dep
-      0,                                                  // pop next dep
-      1,                                                  // push prev dep
-      0);                                                 // push next dep
-  // Finish
-  insn_buf[insn_idx++] = getFinishInsn(0, 1);
-
-  // Prepare the uop buffer
-  VTAUop * uop_buf = getGEMMUops(
-      batch / VTA_BATCH,
-      in_channels / VTA_BLOCK_IN,
-      out_channels / VTA_BLOCK_OUT,
-      uop_compression,
-      0);
-
-#if VTA_DEBUG == 1
-  printInstruction(ins_size, insn_buf);
-  printMicroOp(uop_size, uop_buf);
-#endif
-
-  // Initialize inputs
-  inp_T **inputs = allocInit2dArray<inp_T>(batch, in_channels);
-  // Initialize weights
-  wgt_T **weights = allocInit2dArray<wgt_T>(out_channels, in_channels);
-  // Initialize biases
-  acc_T **biases = allocInit2dArray<acc_T>(batch, out_channels);
-
-  // Reference GEMM implementation
-  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < out_channels; j++) {
-      acc_T sum = biases[i][j];
-      for (int k = 0; k < in_channels; k++) {
-        sum += (acc_T) (inputs[i][k] * weights[j][k]);
-      }
-      // Set
-      outputs_ref[i][j] = (out_T) sum;
-    }
-  }
-
-  // Prepare the input buffer
-  uint32_t *input_buf = static_cast<uint32_t *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
-  packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
-                                                 inputs,
-                                                 batch,
-                                                 in_channels,
-                                                 VTA_BATCH,
-                                                 VTA_BLOCK_IN);
-  // Prepare the weight buffer
-  uint32_t *weight_buf = static_cast<uint32_t *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
-  packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
-                                                 weights,
-                                                 out_channels,
-                                                 in_channels,
-                                                 VTA_BLOCK_OUT,
-                                                 VTA_BLOCK_IN);
-  // Prepare the bias buffer
-  uint32_t *bias_buf = static_cast<uint32_t *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
-  packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
-                                                 biases,
-                                                 batch,
-                                                 out_channels,
-                                                 VTA_BATCH,
-                                                 VTA_BLOCK_OUT);
-  // Prepare the output buffer
-  uint32_t *output_buf = static_cast<uint32_t *>(allocBuffer(VTA_OUT_ELEM_BYTES * out_size));
-
-#ifdef NO_SIM
-  // Invoke the VTA
-  uint64_t t_fpga = vta(ins_size,
-                        insn_buf,
-                        uop_buf,
-                        input_buf,
-                        weight_buf,
-                        bias_buf,
-                        output_buf);
-  // Report on timining
-  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
-  printf("INFO - Throughput: %.3lfGOPs/s\n",
-         static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
-#else
-  // Invoke the VTA
-  vta(ins_size,
-      (volatile insn_T *) insn_buf,
-      (volatile uop_T *) uop_buf,
-      (volatile bus_T *) input_buf,
-      (volatile bus_T *) weight_buf,
-      (volatile bus_T *) bias_buf,
-      (volatile bus_T *) output_buf);
-#endif
-
-  // Unpack output data
-  out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
-  unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
-                                                   output_buf,
-                                                   batch,
-                                                   out_channels,
-                                                   VTA_BATCH,
-                                                   VTA_BLOCK_OUT);
-
-  // Correctness checks
-  int err = 0;
-  for (int i = 0; i < batch; i++) {
-    for (int j = 0; j < out_channels; j++) {
-      if (outputs_ref[i][j] != outputs[i][j]) {
-        err++;
-#if VTA_DEBUG == 1
-        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
-               static_cast<int>(outputs_ref[i][j]),
-               static_cast<int>(outputs[i][j]));
-#endif
-      }
-    }
-  }
-
-  // Free all allocated arrays
-  free2dArray<inp_T>(inputs, batch, in_channels);
-  free2dArray<wgt_T>(weights, out_channels, in_channels);
-  free2dArray<acc_T>(biases, batch, out_channels);
-  free2dArray<out_T>(outputs_ref, batch, out_channels);
-  free2dArray<out_T>(outputs, batch, out_channels);
-  freeBuffer(insn_buf);
-  freeBuffer(uop_buf);
-  freeBuffer(input_buf);
-  freeBuffer(weight_buf);
-  freeBuffer(bias_buf);
-  freeBuffer(output_buf);
-
-  if (err == 0) {
-    printf("INFO - Blocked GEMM test successful!\n");
-    return 0;
-  } else {
-    printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
-    return -1;
-  }
-}
diff --git a/vta/vta-hw/tests/hardware/common/test_lib.h b/vta/vta-hw/tests/hardware/common/test_lib.h
deleted file mode 100644
index f1dbdc807fcf..000000000000
--- a/vta/vta-hw/tests/hardware/common/test_lib.h
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file test_lib.cpp
- * \brief Test library for the VTA design simulation and driver tests.
- */
-
-#ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_
-#define TESTS_HARDWARE_COMMON_TEST_LIB_H_
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <vta/hw_spec.h>
-
-#ifdef NO_SIM
-
-#include <vta/driver.h>
-
-#ifdef VTA_TARGET_PYNQ
-#include "../../../src/pynq/pynq_driver.h"
-#endif  // VTA_TARGET_PYNQ
-
-typedef uint32_t uop_T;
-typedef int8_t wgt_T;
-typedef int8_t inp_T;
-typedef int8_t out_T;
-typedef int32_t acc_T;
-
-uint64_t vta(
-  uint32_t insn_count,
-  VTAGenericInsn *insns,
-  VTAUop *uops,
-  inp_T *inputs,
-  wgt_T *weights,
-  acc_T *biases,
-  inp_T *outputs);
-
-#else  // NO_SIM
-
-#include "../../../hardware/xilinx/src/vta.h"
-
-#endif  // NO_SIM
-
-/*!
-* \brief Returns opcode string.
-* \param opcode Opcode parameter (defined in vta_defines.h).
-* \param use_imm Boolean that indicates if the operation uses an immediate value.
-* \return The opcode string.
-*/
-const char* getOpcodeString(int opcode, bool use_imm);
-
-/*!
-* \brief Performs buffer data packing and tiling.
-* \param dst Pointer to the packed, and tiled destination 1D array (flattened).
-* \param src Pointer to the unpacked source 2D array.
-* \param y_size Number of rows.
-* \param x_size Number of columns.
-* \param y_block Inner tiling along row dimension.
-* \param x_block Inner tiling along column dimension.
-*/
-template <typename T, int T_WIDTH>
-void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block);
-
-/*!
-* \brief Performs buffer data unpacking.
-* \param dst Pointer to the unpacked destination 2D array.
-* \param src Pointer to the packed, and tiled source 1D array (flattened).
-* \param y_size Number of rows.
-* \param x_size Number of columns.
-* \param y_block Inner tiling along row dimension.
-* \param x_block Inner tiling along column dimension.
-*/
-template <typename T, int T_WIDTH>
-void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
-
-/*!
-* \brief Allocates and randomly initializes a 2D array in the heap.
-* \param rows Number of rows.
-* \param cols Number of columns.
-* \return Pointer to the 2D array.
-*/
-template <typename T>
-T ** allocInit2dArray(int rows, int cols);
-
-/*!
-* \brief Allocates and initializes a 2D array to a set value in the heap.
-* \param rows Number of rows.
-* \param cols Number of columns.
-* \param val Value to set the whole array to.
-* \return Pointer to the 2D array.
-*/
-template <typename T>
-T ** allocSet2dArray(int rows, int cols, int val);
-
-/*!
-* \brief Allocates a 2D array in the heap.
-* \param rows Number of rows.
-* \param cols Number of columns.
-* \return Pointer to the 2D array.
-*/
-template <typename T>
-T ** alloc2dArray(int rows, int cols);
-
-/*!
-* \brief Frees a 2D array.
-* \param array Pointer to the 2D array to be freed.
-* \param rows Number of rows.
-* \param cols Number of columns.
-*/
-template <typename T>
-void free2dArray(T **array, int rows, int cols);
-
-/*!
-* \brief Allocates a 3D array in the heap.
-* \param rows Number of rows (dim 0).
-* \param cols Number of columns (dim 1).
-* \param depth Depth of the array (dim 2).
-* \return Pointer to the 3D array.
-*/
-template <typename T>
-T *** alloc3dArray(int rows, int cols, int depth);
-
-/*!
-* \brief Frees a 3D array.
-* \param array Pointer to the 3D array.
-* \param rows Number of rows (dim 0).
-* \param cols Number of columns (dim 1).
-* \param depth Depth of the array (dim 2).
-*/
-template <typename T>
-void free3dArray(T *** array, int rows, int cols, int depth);
-
-/*!
-* \brief Performs memory allocation in a physically contiguous region of memory.
-* \param num_bytes Size of the buffer in bytes.
-* \return Pointer to the allocated buffer.
-*/
-void * allocBuffer(size_t num_bytes);
-
-/*!
-* \brief Frees buffer allocated in a physically contiguous region of memory.
-* \param buffer Pointer to the buffer to free.
-*/
-void freeBuffer(void * buffer);
-
-/*!
-* \brief Returns a VTA reset instruction on a 2D patch of the register file.
-* \param type On-chip memory target.
-* \param sram_offset Offset in SRAM.
-* \param y_size Number of rows to reset (y axis).
-* \param x_size Number of elements per row to reset (x axis).
-* \param x_stride Stride along the x axis.
-* \param pop_prev_dep Pop dependence from previous stage.
-* \param pop_next_dep Pop dependence from next stage.
-* \param push_prev_dep Push dependence to previous stage.
-* \param push_next_dep Push dependence to next stage.
-* \return A VTAGenericInsn for a reset op.
-*/
-VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, int x_stride,
-  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
-
-/*!
-* \brief Returns a VTA 2D load or store instruction.
-* \param opcode Type of operation.
-* \param type On-chip memory target.
-* \param sram_offset Offset in SRAM.
-* \param dram_offset Offset in DRAM.
-* \param y_size Number of rows to load/store (y axis).
-* \param x_size Number of elements per row to load/store (x axis).
-* \param x_stride Stride along the x axis.
-* \param y_pad Padding along the y axis.
-* \param x_pad Padding along the x axis.
-* \param pop_prev_dep Pop dependence from previous stage.
-* \param pop_next_dep Pop dependence from next stage.
-* \param push_prev_dep Push dependence to previous stage.
-* \param push_next_dep Push dependence to next stage.
-* \return A VTAGenericInsn for a 2D load or store op.
-*/
-VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
-  int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
-  int push_prev_dep, int push_next_dep);
-
-/*!
-* \brief Returns a VTA 1D load or store instruction.
-* \param opcode Type of operation.
-* \param type On-chip memory target.
-* \param sram_offset Offset in SRAM.
-* \param dram_offset Offset in DRAM.
-* \param size Number of elements to load/store.
-* \param pop_prev_dep Pop dependence from previous stage.
-* \param pop_next_dep Pop dependence from next stage.
-* \param push_prev_dep Push dependence to previous stage.
-* \param push_next_dep Push dependence to next stage.
-* \return A VTAGenericInsn for a 1D load or store op.
-*/
-VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
-  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
-
-/*!
-* \brief Returns a VTA matrix multiplication instruction of size (a, b) x (b, c).
-* \param uop_offset Offset of the micro-op in SRAM.
-* \param batch Batch size (a).
-* \param in_feat Input features (b).
-* \param out_feat Output features (c).
-* \param uop_compression Apply micro-op compression.
-* \param pop_prev_dep Pop dependence from previous stage.
-* \param pop_next_dep Pop dependence from next stage.
-* \param push_prev_dep Push dependence to previous stage.
-* \param push_next_dep Push dependence to next stage.
-* \return A VTAGenericInsn for a GEMM op.
-*/
-VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
-  bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
-  int push_next_dep);
-
-/*!
-* \brief Returns a VTA ALU instruction for map type operation.
-* \param opcode Opcode of the ALU instruction.
-* \param vector_size Vector size of the ALU operation size.
-* \param use_imm Use immediate.
-* \param imm Immediate value (int16).
-* \param uop_compression Apply micro-op compression.
-* \param pop_prev_dep Pop dependence from previous stage.
-* \param pop_next_dep Pop dependence from next stage.
-* \param push_prev_dep Push dependence to previous stage.
-* \param push_next_dep Push dependence to next stage.
-* \return A VTAGenericInsn for a ALU op.
-*/
-VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bool uop_compression,
-  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
-
-/*!
-* \brief Returns a VTA finish instruction.
-* \param pop_prev Pop dependence from previous stage.
-* \param pop_next Pop dependence from next stage.
-* \return A VTAGenericInsn for a finish op.
-*/
-VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next);
-
-/*!
-* \brief Returns an allocated buffer of VTA micro-ops to implement a copy operation.
-* \param y_size Number of rows to load/store (y axis).
-* \param x_size Number of elements per row to load/store (x axis).
-* \param uop_compression Apply micro-op compression.
-* \return A VTAUop pointer to an allocated micro-op buffer.
-*/
-VTAUop * getCopyUops(int y_size, int x_size, int uop_compression);
-
-/*!
-* \brief Returns an allocated buffer of VTA micro-ops to implement a matrix multiplication
-*   of size (a, b) x (b, c).
-* \param batch Batch size (a).
-* \param in_feat Input features (b).
-* \param out_feat Output features (c).
-* \param uop_compression Apply micro-op compression.
-* \param multi_threaded Generate micro-ops for two virtual execution threads.
-* \return A VTAUop pointer to an allocated micro-op buffer.
-*/
-VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
-  bool multi_threaded);
-
-/*!
-* \brief Returns an allocated buffer of VTA micro-ops to implement a vector-vector map operation.
-* \param vector_size Vector size.
-* \param uop_compression Apply micro-op compression.
-* \return A VTAUop pointer to an allocated micro-op buffer.
-*/
-VTAUop * getMapALUUops(int vector_size, bool uop_compression);
-
-/*!
-* \brief Print out parameters of the VTA design (for debugging purposes).
-*/
-void printParameters();
-
-/*!
-* \brief Print out instruction information (for debugging purposes).
-* \param num_insn Number of instructions.
-* \param insns Pointer to the instruction buffer.
-*/
-void printInstruction(int num_insn, VTAGenericInsn *insns);
-
-/*!
-* \brief Print out micro-op information (for debugging purposes).
-* \param num_insn Number of micro-ops.
-* \param insns Pointer to the micro-op buffer.
-*/
-void printMicroOp(int num_uop, VTAUop *uops);
-
-/*!
-* \brief VTA ALU unit test.
-* \param opcode The ALU opcode.
-* \param use_imm Use immediate.
-* \param batch Batch size.
-* \param vector_size Vector length of the ALU operation.
-* \param uop_compression Apply micro-op compression.
-* \return Number of errors from the test run.
-*/
-int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression);
-
-/*!
-* \brief VTA blocked GEMM unit test.
-* \param batch Batch size.
-* \param channels Channel width.
-* \param block Blocking size.
-* \param uop_compression Apply micro-op compression.
-* \return Number of errors from the test run.
-*/
-int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
-  int virtual_threads);
-
-/*!
-* \brief VTA GEMM unit test.
-* \param batch Batch size.
-* \param in_channels Input channels.
-* \param out_channels Output channels.
-* \param uop_compression Apply micro-op compression.
-* \return Number of errors from the test run.
-*/
-int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression);
-
-#endif  //  TESTS_HARDWARE_COMMON_TEST_LIB_H_
diff --git a/vta/vta-hw/tests/hardware/metal_test/Makefile b/vta/vta-hw/tests/hardware/metal_test/Makefile
deleted file mode 100644
index ef1dfc274916..000000000000
--- a/vta/vta-hw/tests/hardware/metal_test/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-CC ?= g++
-CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
-LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
-LIBS = -l:libcma.so -lstdc++ -pthread
-INCLUDE_DIR = ../../../include
-DRIVER_DIR = ../../../src/pynq
-TESTLIB_DIR = ../common
-VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
-SOURCES = pynq_driver.cc test_lib.cc
-OBJECTS = pynq_driver.o test_lib.o metal_test.o
-EXECUTABLE = vta
-
-# Include VTA config
-VTA_CONFIG = python ../../../config/vta_config.py
-CFLAGS += `${VTA_CONFIG} --cflags`
-LDFLAGS += `${VTA_CONFIG} --ldflags`
-VTA_TARGET := $(shell ${VTA_CONFIG} --target)
-
-# Include bitstream
-VTA_PROGRAM = python3 ../../../python/vta/program_bitstream.py
-VTA_BIT = "vta.bit"
-
-# Define flags
-CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0
-
-# All Target
-all: vtainstall $(EXECUTABLE)
-
-%.o: %.cc $(SOURCES)
-	$(CC) -c -o $@ $< $(CFLAGS)
-
-$(EXECUTABLE): $(OBJECTS)
-	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
-
-vtainstall:
-	${VTA_PROGRAM} ${VTA_TARGET} ${VTA_BIT}
-clean:
-	rm -rf *.o $(EXECUTABLE)
diff --git a/vta/vta-hw/tests/hardware/metal_test/metal_test.cc b/vta/vta-hw/tests/hardware/metal_test/metal_test.cc
deleted file mode 100644
index cc0da5ccb440..000000000000
--- a/vta/vta-hw/tests/hardware/metal_test/metal_test.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file metal_test.cpp
- * \brief Bare-metal test to test driver and VTA design.
- */
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <vta/driver.h>
-#ifdef VTA_TARGET_PYNQ
-#  include "../../../src/pynq/pynq_driver.h"
-#endif  // VTA_TARGET_PYNQ
-#include "../common/test_lib.h"
-
-int main(void) {
-#if VTA_DEBUG == 1
-  printParameters();
-#endif
-
-  int status = 0;
-
-  // Run ALU test (vector-scalar operators)
-  status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
-  status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
-  status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
-  status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
-  status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
-  status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
-
-  // Run ALU test (vector-vector operators)
-  status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
-  status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
-  status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
-  status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
-
-  // Run blocked GEMM test
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
-
-  if (status == 0) {
-    printf("\nINFO - Unit tests successful!\n");
-  } else {
-    printf("\nINTO - Unit tests failed!\n");
-  }
-
-  return status;
-}