From 427afd4729de14c5b7d4acea13cb8b2ca54c0181 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Sun, 28 Jul 2019 18:41:10 -0700 Subject: [PATCH] [VTA] Refactor to increase platform coverage (Ultra96 etc.) (#3496) * hardware refactor for increased FPGA coverage, small optimizations * fix header * cleaning up parameters that won't be needed for now * streamlining makefile, and simplifying tcl scripts * moving parameter derivation into pkg_config.py, keeping tcl scripts lightweight * refactoring tcl script to avoid global variables * deriving AXI signals in pkg_config.py * unifying address map definition for hardware and software drivers * single channel design for ultra96 to simplify build * enable alu by default, no mul opcode for now * hardware fix * new bitstream; vta version * avoid error when env variable is not set * ultra96 cleanup * further cleaning up tcl script for bitstream generation * preliminary rpc server support on ultra96 * rpc server tracker scripts * ultra96 ldflag * ultra96 support * ultra96 support * cleanup line * cmake support for ultra96 * simplify memory instantiation * cleaning up IP parameter initialization * fix queue instantiation * 2019.1 transition * fix macro def * removing bus width from config * cleanup * fix * turning off testing for now * cleanup ultra96 ps instantiation * minor refactor * adding comments * upgrading to tophub v0.6 * model used in TVM target now refers to a specific version of VTA for better autoTVM scheduling * revert change due to bug * rename driver files to be for zynq-type devices * streamlining address mapping * unifying register map offset values between driver and hardware generator * rely on cma library for cache flush/invalidation * coherence management * do not make buffer packing depend on data types that can be wider than 64 bits * refactor config derivation to minimize free parameters * fix environment/pkg config interaction * adding cfg dump property to pkgconfig * fix rpc reconfig * fix spacing * cleanup * fix spacing * long line fix * fix spacing and lint * fix line length * cmake fix * environment fix * renaming after pynq since the driver stack relies on the pynq library - see pynq.io * update doc * adding parameterization to name * space * removing reg width * vta RPC * update doc on how to edit vta_config.json * fix path * fix path --- .../{pynq_rpc => vta_rpc}/start_rpc_server.sh | 0 .../start_rpc_server_to_tracker.py} | 5 +- cmake/modules/VTA.cmake | 17 +- docs/vta/dev/config.rst | 23 +- docs/vta/install.md | 25 +- python/tvm/autotvm/tophub.py | 2 +- vta/config/pynq_sample.json | 10 +- vta/config/ultra96_sample.json | 13 + vta/config/vta_config.json | 8 +- vta/config/vta_config.py | 185 +-- vta/hardware/xilinx/Makefile | 110 +- vta/hardware/xilinx/scripts/hls.tcl | 293 ++-- vta/hardware/xilinx/scripts/vivado.tcl | 1193 +++++------------ vta/hardware/xilinx/sim/vta_test.cc | 17 +- vta/hardware/xilinx/src/vta.cc | 851 ++++++------ vta/hardware/xilinx/src/vta.h | 65 +- vta/include/vta/driver.h | 16 +- vta/include/vta/hw_spec.h | 247 +--- vta/python/vta/bitstream.py | 5 +- vta/python/vta/environment.py | 53 +- vta/python/vta/pkg_config.py | 194 ++- vta/python/vta/program_bitstream.py | 7 +- vta/python/vta/rpc_client.py | 2 +- vta/python/vta/testing/util.py | 17 +- vta/src/pynq/pynq_driver.cc | 68 +- vta/src/pynq/pynq_driver.h | 46 +- vta/src/runtime.cc | 32 +- vta/src/sim/sim_driver.cc | 4 +- vta/src/tsim/tsim_driver.cc | 4 +- vta/tests/hardware/common/test_lib.cc | 405 +++--- vta/tests/hardware/common/test_lib.h | 16 +-
vta/tests/python/unittest/test_environment.py | 2 +- .../frontend/deploy_resnet_on_vta.py | 4 +- 33 files changed, 1610 insertions(+), 2329 deletions(-) rename apps/{pynq_rpc => vta_rpc}/start_rpc_server.sh (100%) rename apps/{pynq_rpc/start_rpc_server_to_tracker.sh => vta_rpc/start_rpc_server_to_tracker.py} (83%) create mode 100644 vta/config/ultra96_sample.json diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/vta_rpc/start_rpc_server.sh similarity index 100% rename from apps/pynq_rpc/start_rpc_server.sh rename to apps/vta_rpc/start_rpc_server.sh diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/vta_rpc/start_rpc_server_to_tracker.py similarity index 83% rename from apps/pynq_rpc/start_rpc_server_to_tracker.sh rename to apps/vta_rpc/start_rpc_server_to_tracker.py index f1b906327add..fd2998efe095 100755 --- a/apps/pynq_rpc/start_rpc_server_to_tracker.sh +++ b/apps/vta_rpc/start_rpc_server_to_tracker.py @@ -17,7 +17,10 @@ # under the License. PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" +# Derive target specified by vta_config.json +VTA_CONFIG=${PROJROOT}/vta/config/vta_config.py +TARGET=$(python ${VTA_CONFIG} --target) export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq -python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq +python3 -m vta.exec.rpc_server --tracker fleet:9190 --key $TARGET diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 6d5ea000edc2..bae8d458d298 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -38,11 +38,16 @@ elseif(PYTHON) string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}") file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc) - file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc) + # Add sim driver sources + if(${VTA_TARGET} STREQUAL "sim") + file(GLOB __vta_target_srcs vta/src/sim/*.cc) + endif() + # Add pynq driver sources + if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") + file(GLOB __vta_target_srcs vta/src/pynq/*.cc) + endif() list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs}) - - add_library(vta SHARED ${VTA_RUNTIME_SRCS}) - + # Add tsim driver sources if(${VTA_TARGET} STREQUAL "tsim") target_compile_definitions(vta PUBLIC USE_TSIM) include_directories("vta/include") @@ -50,6 +55,8 @@ elseif(PYTHON) list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS}) endif() + add_library(vta SHARED ${VTA_RUNTIME_SRCS}) + target_include_directories(vta PUBLIC vta/include) foreach(__def ${VTA_DEFINITIONS}) @@ -62,7 +69,7 @@ elseif(PYTHON) endif(APPLE) # PYNQ rules for Pynq v2.4 - if(${VTA_TARGET} STREQUAL "pynq") + if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") find_library(__cma_lib NAMES cma PATH /usr/lib) target_link_libraries(vta ${__cma_lib}) endif() diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst index 0ca6b99759c0..f4b5bcec8af1 100644 --- a/docs/vta/dev/config.rst +++ b/docs/vta/dev/config.rst @@ -36,10 +36,6 @@ below. +=======================+============+========================================================+ | ``TARGET`` | String | The TVM device target. | +-----------------------+------------+--------------------------------------------------------+ -| ``HW_TARGET`` | Int | FPGA frequency in MHz. | -+-----------------------+------------+--------------------------------------------------------+ -| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool. 
| -+-----------------------+------------+--------------------------------------------------------+ | ``HW_VER`` | String | VTA hardware version number. | +-----------------------+------------+--------------------------------------------------------+ | ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. | @@ -48,13 +44,9 @@ below. +-----------------------+------------+--------------------------------------------------------+ | ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. | +-----------------------+------------+--------------------------------------------------------+ -| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. | +| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic input/output dimension 0. | +-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. | +| ``LOG_BLOCK`` | Int (log2) | VTA matrix multiply inner dimensions. | +-----------------------+------------+--------------------------------------------------------+ | ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. | @@ -75,13 +67,8 @@ below. We provide additional detail below regarding each parameter: - - ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``. - - ``HW_TARGET``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz. - - ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance). + - ``TARGET``: Can be set to ``"pynq"``, ``"ultra96"``, ``"sim"`` (fast simulator), or ``"tsim"`` (cycle-accurate simulator with Verilator). - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identify hardware bitstreams. - - ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``. - - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension. - - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension. - - ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension. - - ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension. + - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of the inner tensor computation. + - ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the inner tensor computation.
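To make the tensor intrinsic shape concrete, here is a small illustrative Python sketch (an editorial aside, not part of the patch) of how the log2 parameters from the default `vta/config/vta_config.json` map onto the (A, B) x (B, C) matrix multiply intrinsic:

```python
# Log2 values taken from the default vta/config/vta_config.json in this patch.
LOG_BATCH = 0
LOG_BLOCK = 4

batch = 1 << LOG_BATCH   # A = 1
block = 1 << LOG_BLOCK   # B = C = 16

# Per-instruction GEMM intrinsic: (A, B) x (B, C) -> (A, C)
inp_tile = (batch, block)   # (1, 16) input tile
wgt_tile = (block, block)   # (16, 16) weight tile
out_tile = (batch, block)   # (1, 16) output tile
print(inp_tile, wgt_tile, out_tile)
```

Note that replacing the former `LOG_BLOCK_IN`/`LOG_BLOCK_OUT` pair with a single `LOG_BLOCK` constrains the weight tile to be square, in line with the commit's stated goal of minimizing free configuration parameters.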
diff --git a/docs/vta/install.md b/docs/vta/install.md index 6c87b4edd288..2583e331ecd5 100644 --- a/docs/vta/install.md +++ b/docs/vta/install.md @@ -61,7 +61,7 @@ To do so, ```bash cd -cp vta/config/vta_config.json vta_config.json +vim vta/config/vta_config.json # edit vta_config.json make vta ``` @@ -118,7 +118,7 @@ cd /home/xilinx/tvm mkdir build cp cmake/config.cmake build/. # Copy pynq specific configuration -cp vta/config/pynq_sample.json build/vta_config.json +cp vta/config/pynq_sample.json vta/config/vta_config.json cd build cmake .. make runtime vta -j2 @@ -147,13 +147,12 @@ export VTA_PYNQ_RPC_PORT=9091 ``` In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`. -Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`. > Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board. ```bash # On the Host-side cd -cp vta/config/pynq_sample.json vta_config.json +cp vta/config/pynq_sample.json vta/config/vta_config.json ``` This time again, we will run the 2D convolution testbench. @@ -187,28 +186,28 @@ This third and last guide allows users to generate custom VTA bitstreams using f ### Xilinx Toolchain Installation -We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains. +We recommend using `Vivado 2019.1` since our scripts have been tested to work on this version of the Xilinx toolchains. Our guide is written for Linux (Ubuntu) installation. You'll need to install Xilinx's FPGA compilation toolchain, [Vivado HL WebPACK 2019.1](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain. #### Obtaining and Launching the Vivado GUI Installer 1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2019-1.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2019.1: WebPACK and Editions. 2. You'll have to sign in with a Xilinx account. This requires creating a Xilinx account, which takes about 2 minutes. 3. Complete the Name and Address Verification by clicking "Next", and you will get the opportunity to download a binary file called `Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin`. 4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed: ```bash -chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin +chmod u+x Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin ``` 5.
Now you can execute the binary: ```bash -./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin +./Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin ``` #### Xilinx Vivado GUI Installer Steps -At this point you've launched the Vivado 2018.2 Installer GUI program. +At this point you've launched the Vivado 2019.1 Installer GUI program. 1. Click "Next" on the *Welcome* screen. 2. On the *Select Install Type* screen, enter your Xilinx user credentials under the "User Authentication" box and select the "Download and Install Now" option before clicking "Next". @@ -230,8 +229,8 @@ At this point you've launched the Vivado 2018.2 Installer GUI program. The last step is to update your `~/.bashrc` with the following lines. This will include all of the Xilinx binary paths so you can launch compilation scripts from the command line. ```bash -# Xilinx Vivado 2018.2 environment -export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2 +# Xilinx Vivado 2019.1 environment +export XILINX_VIVADO=${XILINX_PATH}/Vivado/2019.1 export PATH=${XILINX_VIVADO}/bin:${PATH} ``` diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 37a95d6f774d..0130384c2e69 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -44,7 +44,7 @@ 'opencl': "v0.02", 'mali': "v0.05", - 'vta': "v0.05", + 'vta': "v0.06", } logger = logging.getLogger('autotvm') diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json index 5c37108e6b12..380984a28972 100644 --- a/vta/config/pynq_sample.json +++ b/vta/config/pynq_sample.json @@ -1,17 +1,13 @@ { "TARGET" : "pynq", - "HW_FREQ" : 100, - "HW_CLK_TARGET" : 8, - "HW_VER" : "0.0.0", + "HW_VER" : "0.0.1", "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, - "LOG_OUT_WIDTH" : 3, "LOG_BATCH" : 0, - "LOG_BLOCK_IN" : 4, - "LOG_BLOCK_OUT" : 4, + "LOG_BLOCK" : 4, "LOG_UOP_BUFF_SIZE" : 15, "LOG_INP_BUFF_SIZE" : 15, "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/ultra96_sample.json b/vta/config/ultra96_sample.json new file mode 100644 index 000000000000..013420cff52e --- /dev/null +++ b/vta/config/ultra96_sample.json @@ -0,0 +1,13 @@ +{ + "TARGET" : "ultra96", + "HW_VER" : "0.0.1", + "LOG_INP_WIDTH" : 3, + "LOG_WGT_WIDTH" : 3, + "LOG_ACC_WIDTH" : 5, + "LOG_BATCH" : 0, + "LOG_BLOCK" : 4, + "LOG_UOP_BUFF_SIZE" : 15, + "LOG_INP_BUFF_SIZE" : 15, + "LOG_WGT_BUFF_SIZE" : 18, + "LOG_ACC_BUFF_SIZE" : 17 +} diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json index 602af0126816..0591bb486143 100644 --- a/vta/config/vta_config.json +++ b/vta/config/vta_config.json @@ -1,15 +1,11 @@ { "TARGET" : "sim", - "HW_FREQ" : 100, - "HW_CLK_TARGET" : 7, - "HW_VER" : "0.0.0", + "HW_VER" : "0.0.1", "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, - "LOG_OUT_WIDTH" : 3, "LOG_BATCH" : 0, - "LOG_BLOCK_IN" : 4, - "LOG_BLOCK_OUT" : 4, + "LOG_BLOCK" : 4, "LOG_UOP_BUFF_SIZE" : 15, "LOG_INP_BUFF_SIZE" : 15, "LOG_WGT_BUFF_SIZE" : 18, diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py index ea07e5a7770c..b925bf5fe4df 100644 --- a/vta/config/vta_config.py +++ b/vta/config/vta_config.py @@ -30,7 +30,6 @@ def get_pkg_config(cfg): PkgConfig = libpkg["PkgConfig"] return PkgConfig(cfg, proj_root) - def main(): """Main function""" parser = argparse.ArgumentParser() @@ -45,7 +44,7 @@ def main(): parser.add_argument("--update", action="store_true", help="Print out the json option.") parser.add_argument("--ldflags", action="store_true", - help="print the cflags") + help="print the
ldflags") parser.add_argument("--cfg-json", action="store_true", help="print all the config json") parser.add_argument("--save-cfg-json", type=str, default="", @@ -54,33 +53,51 @@ def main(): help="print the target") parser.add_argument("--cfg-str", action="store_true", help="print the configuration string") - parser.add_argument("--get-inpwidth", action="store_true", - help="returns log of input bitwidth") - parser.add_argument("--get-wgtwidth", action="store_true", - help="returns log of weight bitwidth") - parser.add_argument("--get-accwidth", action="store_true", - help="returns log of accum bitwidth") - parser.add_argument("--get-outwidth", action="store_true", - help="returns log of output bitwidth") - parser.add_argument("--get-batch", action="store_true", - help="returns log of tensor batch dimension") - parser.add_argument("--get-blockin", action="store_true", - help="returns log of tensor block in dimension") - parser.add_argument("--get-blockout", action="store_true", - help="returns log of tensor block out dimension") - parser.add_argument("--get-uopbuffsize", action="store_true", - help="returns log of micro-op buffer size in B") - parser.add_argument("--get-inpbuffsize", action="store_true", - help="returns log of input buffer size in B") - parser.add_argument("--get-wgtbuffsize", action="store_true", - help="returns log of weight buffer size in B") - parser.add_argument("--get-accbuffsize", action="store_true", - help="returns log of accum buffer size in B") - parser.add_argument("--get-outbuffsize", action="store_true", - help="returns log of output buffer size in B") - parser.add_argument("--get-fpgafreq", action="store_true", + parser.add_argument("--get-inp-mem-banks", action="store_true", + help="returns number of input memory banks") + parser.add_argument("--get-inp-mem-width", action="store_true", + help="returns input memory read/write port width") + parser.add_argument("--get-inp-mem-depth", action="store_true", + help="returns input memory depth") + parser.add_argument("--get-inp-mem-axi-ratio", action="store_true", + help="returns ratio between input element width and axi width") + parser.add_argument("--get-wgt-mem-banks", action="store_true", + help="returns number of weight memory banks") + parser.add_argument("--get-wgt-mem-width", action="store_true", + help="returns weight memory read/write port width") + parser.add_argument("--get-wgt-mem-depth", action="store_true", + help="returns weight memory depth") + parser.add_argument("--get-wgt-mem-axi-ratio", action="store_true", + help="returns ratio between weight element width and axi width") + parser.add_argument("--get-out-mem-banks", action="store_true", + help="returns number of output memory banks") + parser.add_argument("--get-out-mem-width", action="store_true", + help="returns output memory read/write port width") + parser.add_argument("--get-out-mem-depth", action="store_true", + help="returns output memory depth") + parser.add_argument("--get-out-mem-axi-ratio", action="store_true", + help="returns ratio between output element width and axi width") + parser.add_argument("--get-axi-cache-bits", action="store_true", + help="returns AXI system ARCACHE/AWCACHE hardcoded bit value") + parser.add_argument("--get-axi-prot-bits", action="store_true", + help="returns AXI system ARPROT/AWPROT hardcoded bit value") + parser.add_argument("--get-ip-reg-map-range", action="store_true", + help="returns ip register map address range") + parser.add_argument("--get-fetch-base-addr", action="store_true", + help="returns 
fetch module base address") + parser.add_argument("--get-load-base-addr", action="store_true", + help="returns load module base address") + parser.add_argument("--get-compute-base-addr", action="store_true", + help="returns compute module base address") + parser.add_argument("--get-store-base-addr", action="store_true", + help="returns store module base address") + parser.add_argument("--get-fpga-dev", action="store_true", + help="returns FPGA device target") + parser.add_argument("--get-fpga-family", action="store_true", + help="returns FPGA device family") + parser.add_argument("--get-fpga-freq", action="store_true", help="returns FPGA frequency") - parser.add_argument("--get-fpgaper", action="store_true", + parser.add_argument("--get-fpga-per", action="store_true", help="returns HLS target clock period") args = parser.parse_args() @@ -92,8 +109,6 @@ def main(): os.path.abspath(os.path.expanduser(__file__))) proj_root = os.path.abspath(os.path.join(curr_path, "../../")) path_list = [ - os.path.join(proj_root, "vta_config.json"), - os.path.join(proj_root, "build", "vta_config.json"), os.path.join(proj_root, "vta/config/vta_config.json") ] if args.use_cfg: @@ -102,14 +117,11 @@ def main(): if not ok_path_list: raise RuntimeError("Cannot find config in %s" % str(path_list)) cfg = json.load(open(ok_path_list[0])) - cfg["LOG_OUT_BUFF_SIZE"] = ( - cfg["LOG_ACC_BUFF_SIZE"] + - cfg["LOG_OUT_WIDTH"] - - cfg["LOG_ACC_WIDTH"]) + pkg = get_pkg_config(cfg) if args.target: - print(pkg.target) + print(pkg.TARGET) if args.defs: print(" ".join(pkg.macro_defs)) @@ -119,8 +131,10 @@ def main(): if args.cflags: cflags_str = " ".join(pkg.cflags) - if cfg["TARGET"] == "pynq": + if pkg.TARGET == "pynq": cflags_str += " -DVTA_TARGET_PYNQ" + if pkg.TARGET == "ultra96": + cflags_str += " -DVTA_TARGET_ULTRA96" print(cflags_str) if args.ldflags: @@ -134,63 +148,76 @@ def main(): fo.write(pkg.cfg_json) if args.cfg_str: - # Needs to match the BITSTREAM string in python/vta/environment.py - cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format( - (1 << cfg["LOG_BATCH"]), - (1 << cfg["LOG_BLOCK_IN"]), - (1 << cfg["LOG_BLOCK_OUT"]), - (1 << cfg["LOG_INP_WIDTH"]), - (1 << cfg["LOG_WGT_WIDTH"]), - cfg["LOG_UOP_BUFF_SIZE"], - cfg["LOG_INP_BUFF_SIZE"], - cfg["LOG_WGT_BUFF_SIZE"], - cfg["LOG_ACC_BUFF_SIZE"], - cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) - print(cfg_str) + print(pkg.TARGET + "_" + pkg.bitstream) + + if args.get_inp_mem_banks: + print(pkg.inp_mem_banks) + + if args.get_inp_mem_width: + print(pkg.inp_mem_width) + + if args.get_inp_mem_depth: + print(pkg.inp_mem_depth) + + if args.get_inp_mem_axi_ratio: + print(pkg.inp_mem_axi_ratio) + + if args.get_wgt_mem_banks: + print(pkg.wgt_mem_banks) + + if args.get_wgt_mem_width: + print(pkg.wgt_mem_width) + + if args.get_wgt_mem_depth: + print(pkg.wgt_mem_depth) + + if args.get_wgt_mem_axi_ratio: + print(pkg.wgt_mem_axi_ratio) + + if args.get_out_mem_banks: + print(pkg.out_mem_banks) - if args.get_inpwidth: - print(cfg["LOG_INP_WIDTH"]) + if args.get_out_mem_width: + print(pkg.out_mem_width) - if args.get_wgtwidth: - print(cfg["LOG_WGT_WIDTH"]) + if args.get_out_mem_depth: + print(pkg.out_mem_depth) - if args.get_accwidth: - print(cfg["LOG_ACC_WIDTH"]) + if args.get_out_mem_axi_ratio: + print(pkg.out_mem_axi_ratio) - if args.get_outwidth: - print(cfg["LOG_OUT_WIDTH"]) + if args.get_axi_cache_bits: + print(pkg.axi_cache_bits) - if args.get_batch: - print(cfg["LOG_BATCH"]) + if args.get_axi_prot_bits: + print(pkg.axi_prot_bits) - if 
args.get_blockin: - print(cfg["LOG_BLOCK_IN"]) + if args.get_ip_reg_map_range: + print(pkg.ip_reg_map_range) - if args.get_blockout: - print(cfg["LOG_BLOCK_OUT"]) + if args.get_fetch_base_addr: + print(pkg.fetch_base_addr) - if args.get_uopbuffsize: - print(cfg["LOG_UOP_BUFF_SIZE"]) + if args.get_load_base_addr: + print(pkg.load_base_addr) - if args.get_inpbuffsize: - print(cfg["LOG_INP_BUFF_SIZE"]) + if args.get_compute_base_addr: + print(pkg.compute_base_addr) - if args.get_wgtbuffsize: - print(cfg["LOG_WGT_BUFF_SIZE"]) + if args.get_store_base_addr: + print(pkg.store_base_addr) - if args.get_outbuffsize: - print(cfg["LOG_OUT_BUFF_SIZE"]) + if args.get_fpga_dev: + print(pkg.fpga_device) - if args.get_accbuffsize: - print(cfg["LOG_ACC_BUFF_SIZE"]) + if args.get_fpga_family: + print(pkg.fpga_family) - if args.get_fpgafreq: - print(cfg["HW_FREQ"]) + if args.get_fpga_freq: + print(pkg.fpga_freq) - if args.get_fpgaper: - print(cfg["HW_CLK_TARGET"]) + if args.get_fpga_per: + print(pkg.fpga_per) if __name__ == "__main__": main() diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index af13cdc166f8..77d5d4413f6c 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -17,81 +17,30 @@ # Directories ROOTDIR = $(CURDIR) -BUILD_NAME = build -BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/xilinx -SCRIPT_DIR = $(ROOTDIR)/scripts -SRC_DIR = $(ROOTDIR)/src -SIM_DIR = $(ROOTDIR)/sim -TEST_DIR = $(ROOTDIR)/../../tests/hardware/common -INCLUDE_DIR = $(ROOTDIR)/../../include +VTA_DIR = $(CURDIR)/../.. +BUILD_DIR = $(VTA_DIR)/build/hardware/xilinx +SCRIPT_DIR = $(CURDIR)/scripts +SRC_DIR = $(CURDIR)/src # Executables VIVADO_HLS = vivado_hls VIVADO = vivado -HSI = hsi - -# HLS mode -MODE = skip_sim -# Debug flag -DEBUG = false -# SLURM -SLURM = false -# Prevent generation of DSP -NO_DSP = false -# Prevent generation of ALU -NO_ALU = false # Process VTA JSON config -VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py -CFLAGS := $(shell ${VTA_CONFIG} --cflags) -VTA_TARGET := $(shell ${VTA_CONFIG} --target) - -#--------------------- -# VTA Parameters -#-------------------- -VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth) -VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth) -VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth) -VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth) -VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch) -VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin) -VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout) -VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize) -VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize) -VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize) -VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize) -VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize) - -#--------------------- -# FPGA Parameters -#-------------------- -VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq) -VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper) - -#--------------------- -# Compilation parameters -#-------------------- - -# Number of threads during compilation -VTA_HW_COMP_THREADS = 8 +VTA_CONFIG := $(CURDIR)/../../config/vta_config.py # Derive config name -CONF = $(shell ${VTA_CONFIG} --cfg-str) -IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) -HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) - -ifeq ($(SLURM), true) - IP_BUILD_PATH = /scratch/hls/$(CONF) - HW_BUILD_PATH = /scratch/vivado/$(CONF) -endif +CONF := $(shell python ${VTA_CONFIG} --cfg-str) +IP_BUILD_PATH := 
$(BUILD_DIR)/hls/$(CONF) +HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF) # IP file path -IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip +IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip # Bitstream file path -BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit +BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit -.PHONY: all ip bit bsp clean clean_all +.PHONY: all ip bit clean cleanall all: bit ip: $(IP_PATH) @@ -100,37 +49,24 @@ bit: $(BIT_PATH) $(IP_PATH): $(SRC_DIR)/* mkdir -p $(IP_BUILD_PATH) cd $(IP_BUILD_PATH) && \ - $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ - -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \ - $(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ - $(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \ - $(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), true) - mkdir -p $(BUILD_DIR)/hls - mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/. -endif + $(VIVADO_HLS) \ + -f $(SCRIPT_DIR)/hls.tcl \ + -tclargs \ + $(VTA_DIR) \ + ${VTA_CONFIG} $(BIT_PATH): $(IP_PATH) mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ - $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ - -tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ - $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), true) - mkdir -p $(BUILD_DIR)/vivado - mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/. -endif - -bsp: $(BIT_PATH) - cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog - cd $(HW_BUILD_PATH)/bsp && make + $(VIVADO) \ + -mode tcl \ + -source $(SCRIPT_DIR)/vivado.tcl \ + -tclargs \ + $(BUILD_DIR)/hls/$(CONF) \ + ${VTA_CONFIG} clean: - rm -rf *.out *.log *.sb figures + rm -rf *.out *.log cleanall: clean rm -rf $(BUILD_DIR) diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index 3d308bc58d25..f371d905113b 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -14,220 +14,125 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -# Copyright (c) 2018 by Contributors -# file: hls.tcl -# brief: HLS generation script.
-# # Command line arguments: -# Arg 1: path to design sources -# Arg 2: path to sim sources -# Arg 3: path to test sources -# Arg 4: path to include sources -# Arg 5: mode -# Arg 6: debug -# Arg 7: no_dsp -# Arg 8: no_alu -# Arg 9: target clock period -# Arg 10: input type width (log) -# Arg 11: weight type width (log) -# Arg 12: accum type width (log) -# Arg 13: output type width (log) -# Arg 14: batch size (log) -# Arg 15: in block size (log) -# Arg 16: out block size (log) -# Arg 17: uop buffer size in B (log) -# Arg 18: inp buffer size in B (log) -# Arg 19: wgt buffer size in B (log) -# Arg 20: acc buffer size in B (log) -# Arg 21: out buffer size in B (log) - -if { [llength $argv] eq 23 } { - set src_dir [lindex $argv 2] - set sim_dir [lindex $argv 3] - set test_dir [lindex $argv 4] - set include_dir [lindex $argv 5] - set mode [lindex $argv 6] - set debug [lindex $argv 7] - set no_dsp [lindex $argv 8] - set no_alu [lindex $argv 9] - set target_period [lindex $argv 10] - set inp_width [lindex $argv 11] - set wgt_width [lindex $argv 12] - set acc_width [lindex $argv 13] - set out_width [lindex $argv 14] - set batch [lindex $argv 15] - set block_in [lindex $argv 16] - set block_out [lindex $argv 17] - set uop_buff_size [lindex $argv 18] - set inp_buff_size [lindex $argv 19] - set wgt_buff_size [lindex $argv 20] - set acc_buff_size [lindex $argv 21] - set out_buff_size [lindex $argv 22] +# Arg 1: path to vta root +# Arg 2: path of config param script + +if { [llength $argv] eq 4 } { + set root_dir [lindex $argv 2] + set vta_config [lindex $argv 3] } else { - set src_dir "../src" - set sim_dir "../sim" - set test_dir "../../src/test" - set include_dir "../../include" - set mode "all" - set debug "false" - set no_dsp "true" - set no_alu "false" - set target_period 10 - set inp_width 3 - set wgt_width 3 - set acc_width 5 - set out_width 3 - set batch 1 - set block_in 4 - set block_out 4 - set uop_buff_size 15 - set inp_buff_size 15 - set wgt_buff_size 15 - set acc_buff_size 17 - set out_buff_size 15 - exit + puts "Not enough arguments provided!" + exit } +# Derive paths +set src_dir "$root_dir/hardware/xilinx/src" +set sim_dir "$root_dir/hardware/xilinx/sim" +set test_dir "$root_dir/tests/hardware/common" + +# C define flags that we want to pass to the compiler +set cflags [exec python $vta_config --cflags] + +# Get the VTA configuration parameters +set ::device [exec python $vta_config --get-fpga-dev] +set ::period [exec python $vta_config --get-fpga-per] + +# Get the VTA SRAM reshape/partition factors to get all memories +# to be of the same axi width. +set ::inp_reshape_factor [exec python $vta_config --get-inp-mem-axi-ratio] +set ::inp_partition_factor [exec python $vta_config --get-inp-mem-banks] +set ::wgt_reshape_factor [exec python $vta_config --get-wgt-mem-axi-ratio] +set ::wgt_partition_factor [exec python $vta_config --get-wgt-mem-banks] +set ::out_reshape_factor [exec python $vta_config --get-out-mem-axi-ratio] +set ::out_partition_factor [exec python $vta_config --get-out-mem-banks] + + # Initializes the HLS design and sets HLS pragmas for memory partitioning. # This is necessary because of a Vivado restriction that doesn't allow for # buses wider than 1024 bits.
-proc init_design {per inp_width wgt_width out_width batch block_in block_out} { - - # Set device number - set_part {xc7z020clg484-1} - - # Set the clock frequency - create_clock -period $per -name default - - # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/1024) - set inp_partition_factor [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}] - if {$inp_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "load" inp_mem - set_directive_array_reshape -type complete -dim 2 "compute" inp_mem - } else { - # Set input reshaping factor below to (1024/INP_VECTOR_WIDTH) - set inp_reshape_factor [expr {1024 / (1 << ($inp_width + $block_in))}] - set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "load" inp_mem - set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "compute" inp_mem - set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem - set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem - } - # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/1024) - set wgt_partition_factor [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}] - if {$wgt_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "load" wgt_mem - set_directive_array_reshape -type complete -dim 2 "compute" wgt_mem - } else { - # Set weight reshaping factor below to (1024/WGT_VECTOR_WIDTH) - set wgt_reshape_factor [expr {1024 / (1 << ($wgt_width + $block_in))}] - set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "load" wgt_mem - set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "compute" wgt_mem - set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem - set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem - } - # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/1024) - set out_partition_factor [expr {(1 << ($out_width + $block_out + $batch)) / 1024}] - if {$out_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "compute" out_mem - set_directive_array_reshape -type complete -dim 2 "store" out_mem - } else { - # Set output reshaping factor below to (1024/OUT_VECTOR_WIDTH) - set out_reshape_factor [expr {1024 / (1 << ($out_width + $block_out))}] - set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "compute" out_mem - set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "store" out_mem - set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "compute" out_mem - set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem - } -} +proc init_design {} { -# C define flags to pass to compiler -set cflags "-I $include_dir -I $src_dir -I $test_dir \ - -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \ - -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \ - -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \ - -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ - -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ - -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size" -if {$debug=="true"} { - append cflags " -DVTA_DEBUG=1" -} -if {$no_dsp=="true"} { - append cflags " -DNO_DSP" -} -if {$no_alu=="true"} { - append cflags " -DNO_ALU" + # Set device id + set_part $::device + + # Set the 
clock frequency + create_clock -period $::period -name default + + # HLS pragmas to reshape/partition the input memory read/write port + set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "load" inp_mem + set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "compute" inp_mem + if {$::inp_partition_factor > 1} { + set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "load" inp_mem + set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "compute" inp_mem + } + # HLS pragmas to reshape/partition the weight memory read/write port + set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "load" wgt_mem + set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "compute" wgt_mem + if {$::wgt_partition_factor > 1} { + set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "load" wgt_mem + set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "compute" wgt_mem + } + # HLS pragmas to reshape/partition the output memory read/write port + set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "compute" out_mem + set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "store" out_mem + if {$::out_partition_factor > 1} { + set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "compute" out_mem + set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "store" out_mem + } } # HLS behavioral sim -if {$mode=="all" || $mode=="sim"} { - open_project vta_sim - set_top vta - add_files $src_dir/vta.cc -cflags $cflags - add_files -tb $sim_dir/vta_test.cc -cflags $cflags - add_files -tb $test_dir/test_lib.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csim_design -clean - close_project -} +open_project vta_sim +set_top vta +add_files $src_dir/vta.cc -cflags $cflags +add_files -tb $sim_dir/vta_test.cc -cflags $cflags +add_files -tb $test_dir/test_lib.cc -cflags $cflags +open_solution "soln" +init_design +csim_design -clean +close_project # Generate fetch stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} { - open_project vta_fetch - set_top fetch - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_fetch +set_top fetch +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate load stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} { - open_project vta_load - set_top load - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_load +set_top load +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate compute stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} { - open_project vta_compute - set_top compute - add_files $src_dir/vta.cc
-cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_compute +set_top compute +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate store stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} { - open_project vta_store - set_top store - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_store +set_top store +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project exit diff --git a/vta/hardware/xilinx/scripts/vivado.tcl b/vta/hardware/xilinx/scripts/vivado.tcl index 9cfa10ea7482..3be575749c27 100644 --- a/vta/hardware/xilinx/scripts/vivado.tcl +++ b/vta/hardware/xilinx/scripts/vivado.tcl @@ -14,107 +14,67 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -# Copyright (c) 2018 by Xilinx, Contributors -# file: vivado.tcl -# brief: Vivado compilation script. Partially automatically generated -# by Vivado. -# # Check if script is running in correct Vivado version. -set scripts_vivado_version 2018.2 +set scripts_vivado_version 2019.1 set current_vivado_version [version -short] if { [string first $scripts_vivado_version $current_vivado_version] == -1 } { puts "" catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \ - <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado. \ - Please run the script in Vivado <$scripts_vivado_version> then open the design in Vivado \ - <$current_vivado_version>.
Upgrade the design by running \"Tools => Report => Report IP \ - Status...\", then run write_bd_tcl to create an updated script."} - + <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado."} return 1 } # Parse argument list, derive the clock to utilize -set clock_id 0 -if { [llength $argv] eq 12 } { - set ip_path [lindex $argv 0] - set num_threads [lindex $argv 1] - set clock_freq [lindex $argv 2] - set inp_width [expr 1 << [lindex $argv 3]] - set wgt_width [expr 1 << [lindex $argv 4]] - set out_width [expr 1 << [lindex $argv 5]] - set batch [expr 1 << [lindex $argv 6]] - set out_block [expr 1 << [lindex $argv 7]] - set in_block [expr 1 << [lindex $argv 8]] - set inp_mem_size [expr 1 << [lindex $argv 9]] - set wgt_mem_size [expr 1 << [lindex $argv 10]] - set out_mem_size [expr 1 << [lindex $argv 11]] - if {$clock_freq eq 100} { - set clock_id 0 - puts "Setting clock frequency to 100MHz" - } elseif {$clock_freq eq 142} { - set clock_id 1 - puts "Setting clock frequency to 142MHz" - } elseif {$clock_freq eq 167} { - set clock_id 3 - puts "Setting clock frequency to 167MHz" - } elseif {$clock_freq eq 200} { - set clock_id 2 - puts "Setting clock frequency to 200MHz" - } else { - set clock_id 0 - puts "Unrecognized clock frequency, setting clock to 100MHz" - } +if { [llength $argv] eq 2 } { + set ip_path [lindex $argv 0] + set vta_config [lindex $argv 1] } else { - puts "Arg list incomplete: \ - " + puts "Arg list incomplete: " return 1 } -# Derive input mem parameters -set inp_mem_width [expr $inp_width * $batch * $in_block] -set inp_bus_width 1024 -set inp_part [expr $inp_mem_width / $inp_bus_width] -if {[expr $inp_part == 0]} { - set inp_part 1 - set inp_bus_width $inp_mem_width -} -set inp_mem_depth [expr $inp_mem_size * 8 / ($inp_mem_width * $inp_part)] - -# Derive weight mem parameters -set wgt_mem_width [expr $wgt_width * $out_block * $in_block] -set wgt_bus_width 1024 -set wgt_part [expr $wgt_mem_width / $wgt_bus_width] -if {[expr $wgt_part == 0]} { - set wgt_part 1 - set wgt_bus_width $wgt_mem_width -} -set wgt_mem_depth [expr $wgt_mem_size * 8 / ($wgt_mem_width * $wgt_part)] - -# Derive output mem parameters -set out_mem_width [expr $out_width * $batch * $out_block] -set out_bus_width 1024 -set out_part [expr $out_mem_width / $out_bus_width] -if {[expr $out_part == 0]} { - set out_part 1 - set out_bus_width $out_mem_width -} -set out_mem_depth [expr $out_mem_size * 8 / ($out_mem_width * $out_part)] - -# User defined paths +# Get the VTA configuration parameters +set target [exec python $vta_config --target] +set device_family [exec python $vta_config --get-fpga-family] +set clock_freq [exec python $vta_config --get-fpga-freq] + +# SRAM dimensions +set inp_part [exec python $vta_config --get-inp-mem-banks] +set inp_mem_width [exec python $vta_config --get-inp-mem-width] +set inp_mem_depth [exec python $vta_config --get-inp-mem-depth] +set wgt_part [exec python $vta_config --get-wgt-mem-banks] +set wgt_mem_width [exec python $vta_config --get-wgt-mem-width] +set wgt_mem_depth [exec python $vta_config --get-wgt-mem-depth] +set out_part [exec python $vta_config --get-out-mem-banks] +set out_mem_width [exec python $vta_config --get-out-mem-width] +set out_mem_depth [exec python $vta_config --get-out-mem-depth] + +# AXI bus signals +set axi_cache [exec python $vta_config --get-axi-cache-bits] +set axi_prot [exec python $vta_config --get-axi-prot-bits] + +# Address map +set ip_reg_map_range [exec python $vta_config --get-ip-reg-map-range] +set fetch_base_addr
[exec python $vta_config --get-fetch-base-addr] +set load_base_addr [exec python $vta_config --get-load-base-addr] +set compute_base_addr [exec python $vta_config --get-compute-base-addr] +set store_base_addr [exec python $vta_config --get-store-base-addr] + +# Paths to IP library of VTA modules set proj_name vta +set design_name $proj_name set proj_path "." set ip_lib "ip_lib" -set fetch_ip "${ip_path}/vta_fetch/solution0/impl/ip/xilinx_com_hls_fetch_1_0.zip" -set load_ip "${ip_path}/vta_load/solution0/impl/ip/xilinx_com_hls_load_1_0.zip" -set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip" -set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip" +set fetch_ip "${ip_path}/vta_fetch/soln/impl/ip/xilinx_com_hls_fetch_1_0.zip" +set load_ip "${ip_path}/vta_load/soln/impl/ip/xilinx_com_hls_load_1_0.zip" +set compute_ip "${ip_path}/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip" +set store_ip "${ip_path}/vta_store/soln/impl/ip/xilinx_com_hls_store_1_0.zip" # Create custom project -create_project -force $proj_name $proj_path -part xc7z020clg484-1 +set device [exec python $vta_config --get-fpga-dev] +create_project -force $proj_name $proj_path -part $device # Update IP repository with generated IP file mkdir $ip_lib @@ -125,810 +85,334 @@ update_ip_catalog -add_ip $load_ip -repo_path $ip_lib update_ip_catalog -add_ip $compute_ip -repo_path $ip_lib update_ip_catalog -add_ip $store_ip -repo_path $ip_lib -# CHANGE DESIGN NAME HERE -set design_name $proj_name - -# Creating design if needed -set errMsg "" -set nRet 0 - -set cur_design [current_bd_design -quiet] -set list_cells [get_bd_cells -quiet] - -if { ${design_name} eq "" } { - # USE CASES: - # 1) Design_name not set - - set errMsg "Please set the variable to a non-empty value." - set nRet 1 - -} elseif { ${cur_design} ne "" && ${list_cells} eq "" } { - # USE CASES: - # 2): Current design opened AND is empty AND names same. - # 3): Current design opened AND is empty AND names diff; design_name NOT in project. - # 4): Current design opened AND is empty AND names diff; design_name exists in project. - - if { $cur_design ne $design_name } { - common::send_msg_id "BD_TCL-001" "INFO" "Changing value of from <$design_name> \ - to <$cur_design> since current design is empty." - set design_name [get_property NAME $cur_design] - } - common::send_msg_id "BD_TCL-002" "INFO" "Constructing design in IPI design <$cur_design>..." - -} elseif { ${cur_design} ne "" && $list_cells ne "" && $cur_design eq $design_name } { - # USE CASES: - # 5) Current design opened AND has components AND same names. - - set errMsg "Design <$design_name> already exists in your project, please set the variable \ - to another value." - set nRet 1 -} elseif { [get_files -quiet ${design_name}.bd] ne "" } { - # USE CASES: - # 6) Current opened design, has components, but diff names, design_name exists in project. - # 7) No opened design, design_name exists in project. - - set errMsg "Design <$design_name> already exists in your project, please set the variable \ - to another value." - set nRet 2 - -} else { - # USE CASES: - # 8) No opened design, design_name not in project. - # 9) Current opened design, has components, but diff names, design_name not in project. - - common::send_msg_id "BD_TCL-003" "INFO" "Currently there is no design <$design_name> in \ - project, so creating one..." - - create_bd_design $design_name - - common::send_msg_id "BD_TCL-004" "INFO" "Making design <$design_name> as current_bd_design." 
- current_bd_design $design_name - -} - -common::send_msg_id "BD_TCL-005" "INFO" "Currently the variable is equal \ - to \"$design_name\"." - -if { $nRet != 0 } { - catch {common::send_msg_id "BD_TCL-114" "ERROR" $errMsg} - return $nRet -} ################################################################## -# DESIGN PROCs +# CONFIGURE BLOCK DIAGRAM DESIGN ################################################################## +# Create bd design +create_bd_design $design_name +current_bd_design $design_name - -# Procedure to create entire design; Provide argument to make -# procedure reusable. If parentCell is "", will use root. -proc create_root_design { parentCell clk inp_part wgt_part out_part inp_bus_width inp_mem_depth wgt_bus_width wgt_mem_depth out_bus_width out_mem_depth} { - - variable script_folder - - if { $parentCell eq "" } { - set parentCell [get_bd_cells /] - } - - # Get object for parentCell - set parentObj [get_bd_cells $parentCell] - if { $parentObj == "" } { - catch {common::send_msg_id "BD_TCL-100" "ERROR" "Unable to find parent cell <$parentCell>!"} - return - } - - # Make sure parentObj is hier blk - set parentType [get_property TYPE $parentObj] - if { $parentType ne "hier" } { - catch {common::send_msg_id "BD_TCL-101" "ERROR" "Parent <$parentObj> has TYPE = \ - <$parentType>. Expected to be ."} - return - } - - # Save current instance; Restore later - set oldCurInst [current_bd_instance .] - - # Set parent object as current - current_bd_instance $parentObj - - - # Create interface ports - set DDR [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:ddrx_rtl:1.0 DDR ] - set FIXED_IO [ create_bd_intf_port -mode Master \ - -vlnv xilinx.com:display_processing_system7:fixedio_rtl:1.0 FIXED_IO ] - - # Create ports - - # Create instance: axi_interconnect_1, and set properties - set axi_interconnect_1 \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_1 ] - set_property -dict [ list \ - CONFIG.NUM_MI {5} \ - ] $axi_interconnect_1 - - # Create instance: axi_smc, and set properties - set axi_smc [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc ] - set_property -dict [ list \ - CONFIG.NUM_SI {5} \ - ] $axi_smc - - # Create instance: axi_timer_1, and set properties - set axi_timer_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_1 ] - - # Create instance: compute_0, and set properties - set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - CONFIG.C_M_AXI_DATA_PORT_DATA_WIDTH {64} \ - CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE {"1111"} \ - ] $compute_0 - - # Create instance: fetch_0, and set properties - set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE {"1111"} \ - CONFIG.C_M_AXI_INS_PORT_DATA_WIDTH {64} \ - ] $fetch_0 - - # Create instance: g2l_queue, and set properties - set g2l_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2l_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - 
CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $g2l_queue - - # Create instance: g2s_queue, and set properties - set g2s_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2s_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $g2s_queue - - # Create instance: gemm_queue, and set properties - set gemm_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 gemm_queue ] +# Procedure to initialize FIFO +proc init_fifo_property {fifo width_bytes depth} { set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ + CONFIG.Input_Depth_axis $depth \ CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $gemm_queue - - # Create instance: l2g_queue, and set properties - set l2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 l2g_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $l2g_queue - - # Create instance: load_0, and set 
properties - set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - ] $load_0 + CONFIG.TDATA_NUM_BYTES $width_bytes \ + ] $fifo +} - # Create instance: load_queue, and set properties - set load_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 load_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $load_queue - - # Create instance: proc_sys_reset, and set properties - set proc_sys_reset \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ] - - # Create instance: processing_system7_1, and set properties - set processing_system7_1 \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_1 ] +# Procedure to initialize BRAM +proc init_bram_property {bram width depth} { set_property -dict [ list \ - CONFIG.PCW_CAN0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_ENET0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_EN_CLK0_PORT {1} \ - CONFIG.PCW_EN_CLK1_PORT {1} \ - CONFIG.PCW_EN_CLK2_PORT {1} \ - CONFIG.PCW_EN_CLK3_PORT {1} \ - CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \ - CONFIG.PCW_FPGA1_PERIPHERAL_FREQMHZ {142.86} \ - CONFIG.PCW_FPGA2_PERIPHERAL_FREQMHZ {200} \ - CONFIG.PCW_FPGA3_PERIPHERAL_FREQMHZ {167} \ - CONFIG.PCW_GPIO_MIO_GPIO_ENABLE {0} \ - CONFIG.PCW_I2C0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_IMPORT_BOARD_PRESET {None} \ - CONFIG.PCW_IRQ_F2P_INTR {1} \ - CONFIG.PCW_QSPI_GRP_SINGLE_SS_ENABLE {0} \ - CONFIG.PCW_QSPI_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_SD0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_USB0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \ - CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \ - CONFIG.PCW_USE_HIGH_OCM {1} \ - CONFIG.PCW_USE_S_AXI_ACP {1} \ - CONFIG.PCW_USE_S_AXI_HP0 {0} \ - CONFIG.PCW_USE_S_AXI_HP1 {0} \ - CONFIG.PCW_USE_S_AXI_HP2 {0} \ - CONFIG.PCW_USE_S_AXI_HP3 {0} \ - CONFIG.preset {ZC702} \ - ] $processing_system7_1 + CONFIG.Assume_Synchronous_Clk {true} \ + CONFIG.Byte_Size {8} \ + CONFIG.Enable_32bit_Address {true} \ + CONFIG.Enable_B {Use_ENB_Pin} \ + CONFIG.Memory_Type {True_Dual_Port_RAM} \ + CONFIG.Read_Width_A $width \ + CONFIG.Read_Width_B $width \ + CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ + CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ + CONFIG.Use_Byte_Write_Enable {true} \ + CONFIG.Use_RSTA_Pin {true} \ + CONFIG.Use_RSTB_Pin {true} \ + CONFIG.Write_Depth_A $depth \ + CONFIG.Write_Width_A $width \ + CONFIG.Write_Width_B $width \ + ] $bram +} - # Create instance: s2g_queue, and set properties - set s2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 s2g_queue ] - set_property 
-dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $s2g_queue +# Create instance: proc_sys_reset, and set properties +set proc_sys_reset \ + [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ] + +# Create instance: pll_clk, and set properties +set pll_clk [ create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 pll_clk ] +set_property -dict [ list \ + CONFIG.CLKOUT1_REQUESTED_OUT_FREQ $clock_freq \ + CONFIG.RESET_PORT {resetn} \ + CONFIG.RESET_TYPE {ACTIVE_LOW} \ + CONFIG.USE_LOCKED {false} \ +] $pll_clk + +# Create instance: axi_smc0, and set properties +set axi_smc0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc0 ] +set_property -dict [ list \ + CONFIG.NUM_MI {1} \ + CONFIG.NUM_SI {5} \ +] $axi_smc0 + +# Create instance: axi_xbar, and set properties +set axi_xbar \ + [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_xbar ] +set_property -dict [ list \ + CONFIG.NUM_MI {4} \ + CONFIG.NUM_SI {1} \ +] $axi_xbar + +# Create instance: fetch_0, and set properties +set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_INS_PORT_PROT_VALUE $axi_prot \ +] $fetch_0 + +# Create instance: load_0, and set properties +set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ +] $load_0 + +# Create instance: compute_0, and set properties +set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ + CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_UOP_PORT_PROT_VALUE $axi_prot \ +] $compute_0 + +# Create instance: store_0, and set properties +set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ +] $store_0 + +# Create command queues and set properties +set cmd_queue_list {load_queue gemm_queue store_queue} +foreach cmd_queue $cmd_queue_list { + set tmp_cmd_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $cmd_queue ] + # Width is 16B (128b, as set in hw_spec.h), depth is 512 (depth of FIFO on Zynq 7000 and Zynq Ultrascale+) + # TODO: derive it from vta_config.h + [ init_fifo_property $tmp_cmd_queue 16 512 ] +} - # Create instance: store_0, and set properties - set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ] - set_property -dict [ list \ 
-CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - ] $store_0 +# Create dependence queues and set properties +set dep_queue_list {l2g_queue g2l_queue g2s_queue s2g_queue} +foreach dep_queue $dep_queue_list { + set tmp_dep_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $dep_queue ] + # Width is 1B (min width), depth is 1024 + # TODO: derive it from vta_config.h + [ init_fifo_property $tmp_dep_queue 1 1024 ] +} - # Create instance: store_queue, and set properties - set store_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 store_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $store_queue - - # Create instance: xlconcat_1, and set properties - set xlconcat_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_1 ] - set_property -dict [ list \ -CONFIG.NUM_PORTS {5} \ - ] $xlconcat_1 - - # Create and connect inp_mem partitions - if {${inp_part} > 1} { - for {set i 0} {$i < ${inp_part}} {incr i} { - # Create instance: inp_mem, and set properties - set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $inp_bus_width \ - CONFIG.Read_Width_B $inp_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $inp_mem_depth \ - CONFIG.Write_Width_A $inp_bus_width \ - CONFIG.Write_Width_B $inp_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $inp_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_inp_mem_${i}_V_PORTA \ - [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ - [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA] - connect_bd_intf_net -intf_net compute_0_inp_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $inp_mem/BRAM_PORTB] - } +# Create and connect inp_mem partitions +for {set i 0} {$i < $inp_part} {incr i} { + # Create instance: inp_mem, and set properties + set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ] + [ init_bram_property $inp_mem $inp_mem_width $inp_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$inp_part > 1} { + set porta [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] } else { - # Create instance: inp_mem, and set properties - set inp_mem [ 
create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $inp_bus_width \ - CONFIG.Read_Width_B $inp_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $inp_mem_depth \ - CONFIG.Write_Width_A $inp_bus_width \ - CONFIG.Write_Width_B $inp_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $inp_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \ - [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ - [get_bd_intf_pins load_0/inp_mem_V_PORTA] - connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \ - [get_bd_intf_pins compute_0/inp_mem_V_PORTA] \ - [get_bd_intf_pins $inp_mem/BRAM_PORTB] + set porta [get_bd_intf_pins load_0/inp_mem_V_PORTA] + set portb [get_bd_intf_pins compute_0/inp_mem_V_PORTA] } + # Create interface connections + connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \ + [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \ + [get_bd_intf_pins $inp_mem/BRAM_PORTB] \ + $portb +} - # Create and connect wgt_mem partitions - if {${wgt_part} > 1} { - for {set i 0} {$i < ${wgt_part}} {incr i} { - # Create instance: wgt_mem, and set properties - set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ] - set_property -dict [ list \ - CONFIG.Assume_Synchronous_Clk {true} \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $wgt_bus_width \ - CONFIG.Read_Width_B $wgt_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $wgt_mem_depth \ - CONFIG.Write_Width_A $wgt_bus_width \ - CONFIG.Write_Width_B $wgt_bus_width \ - ] $wgt_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \ - [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTB] - } +# Create and connect wgt_mem partitions +for {set i 0} {$i < $wgt_part} {incr i} { + # Create instance: wgt_mem, and set properties + set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ] + [ init_bram_property $wgt_mem $wgt_mem_width $wgt_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$wgt_part > 1} { + set porta [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] } else { - # Create instance: wgt_mem, and set properties - set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem ] - set_property -dict [ list \ - CONFIG.Assume_Synchronous_Clk {true} \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A 
$wgt_bus_width \ - CONFIG.Read_Width_B $wgt_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $wgt_mem_depth \ - CONFIG.Write_Width_A $wgt_bus_width \ - CONFIG.Write_Width_B $wgt_bus_width \ - ] $wgt_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_wgt_mem_V_PORTA \ - [get_bd_intf_pins load_0/wgt_mem_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net compute_0_wgt_mem_V_PORTA \ - [get_bd_intf_pins compute_0/wgt_mem_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTB] + set porta [get_bd_intf_pins load_0/wgt_mem_V_PORTA] + set portb [get_bd_intf_pins compute_0/wgt_mem_V_PORTA] } + # Create interface connections + connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \ + [get_bd_intf_pins $wgt_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \ + [get_bd_intf_pins $wgt_mem/BRAM_PORTB] \ + $portb +} - # Create and connect out_mem partitions - if {${out_part} > 1} { - for {set i 0} {$i < ${out_part}} {incr i} { - # Create instance: out_mem, and set properties - set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $out_bus_width \ - CONFIG.Read_Width_B $out_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $out_mem_depth \ - CONFIG.Write_Width_A $out_bus_width \ - CONFIG.Write_Width_B $out_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $out_mem - # Create interface connections - connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $out_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \ - [get_bd_intf_pins $out_mem/BRAM_PORTB] \ - [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA] - } +# Create and connect out_mem partitions +for {set i 0} {$i < $out_part} {incr i} { + # Create instance: out_mem, and set properties + set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ] + [ init_bram_property $out_mem $out_mem_width $out_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$out_part > 1} { + set porta [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA] } else { - # Create instance: out_mem, and set properties - set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $out_bus_width \ - CONFIG.Read_Width_B $out_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $out_mem_depth \ - 
CONFIG.Write_Width_A $out_bus_width \ - CONFIG.Write_Width_B $out_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $out_mem - # Create interface connections - connect_bd_intf_net -intf_net compute_0_out_mem_V_PORTA \ - [get_bd_intf_pins compute_0/out_mem_V_PORTA] \ - [get_bd_intf_pins $out_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net store_0_out_mem_V_PORTA \ - [get_bd_intf_pins $out_mem/BRAM_PORTB] \ - [get_bd_intf_pins store_0/out_mem_V_PORTA] + set porta [get_bd_intf_pins compute_0/out_mem_V_PORTA] + set portb [get_bd_intf_pins store_0/out_mem_V_PORTA] } - # Create interface connections - connect_bd_intf_net -intf_net axi_interconnect_1_M01_AXI \ - [get_bd_intf_pins axi_interconnect_1/M01_AXI] \ - [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M02_AXI \ - [get_bd_intf_pins axi_interconnect_1/M02_AXI] \ - [get_bd_intf_pins load_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M03_AXI \ - [get_bd_intf_pins axi_interconnect_1/M03_AXI] \ - [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M04_AXI \ - [get_bd_intf_pins axi_interconnect_1/M04_AXI] \ - [get_bd_intf_pins store_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_smc_M00_AXI \ - [get_bd_intf_pins axi_smc/M00_AXI] \ - [get_bd_intf_pins processing_system7_1/S_AXI_ACP] - connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V \ - [get_bd_intf_pins compute_0/g2l_dep_queue_V] \ - [get_bd_intf_pins g2l_queue/S_AXIS] - connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V \ - [get_bd_intf_pins compute_0/g2s_dep_queue_V] \ - [get_bd_intf_pins g2s_queue/S_AXIS] - connect_bd_intf_net -intf_net compute_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S02_AXI] \ - [get_bd_intf_pins compute_0/m_axi_data_port] - connect_bd_intf_net -intf_net compute_0_m_axi_uop_port \ - [get_bd_intf_pins axi_smc/S01_AXI] \ - [get_bd_intf_pins compute_0/m_axi_uop_port] - connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V \ - [get_bd_intf_pins fetch_0/gemm_queue_V_V] \ - [get_bd_intf_pins gemm_queue/S_AXIS] - connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V \ - [get_bd_intf_pins l2g_queue/S_AXIS] \ - [get_bd_intf_pins load_0/l2g_dep_queue_V] - connect_bd_intf_net -intf_net fetch_0_load_queue_V_V \ - [get_bd_intf_pins fetch_0/load_queue_V_V] \ - [get_bd_intf_pins load_queue/S_AXIS] - connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port \ - [get_bd_intf_pins axi_smc/S00_AXI] \ - [get_bd_intf_pins fetch_0/m_axi_ins_port] - connect_bd_intf_net -intf_net fetch_0_store_queue_V_V \ - [get_bd_intf_pins fetch_0/store_queue_V_V] \ - [get_bd_intf_pins store_queue/S_AXIS] - connect_bd_intf_net -intf_net g2l_queue_M_AXIS \ - [get_bd_intf_pins g2l_queue/M_AXIS] \ - [get_bd_intf_pins load_0/g2l_dep_queue_V] - connect_bd_intf_net -intf_net g2s_queue_M_AXIS \ - [get_bd_intf_pins g2s_queue/M_AXIS] \ - [get_bd_intf_pins store_0/g2s_dep_queue_V] - connect_bd_intf_net -intf_net gemm_queue_M_AXIS \ - [get_bd_intf_pins compute_0/gemm_queue_V_V] \ - [get_bd_intf_pins gemm_queue/M_AXIS] - connect_bd_intf_net -intf_net l2g_queue_M_AXIS \ - [get_bd_intf_pins compute_0/l2g_dep_queue_V] \ - [get_bd_intf_pins l2g_queue/M_AXIS] - connect_bd_intf_net -intf_net load_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S03_AXI] \ - [get_bd_intf_pins load_0/m_axi_data_port] - connect_bd_intf_net -intf_net load_queue_M_AXIS \ - [get_bd_intf_pins load_0/load_queue_V_V] \ - [get_bd_intf_pins load_queue/M_AXIS] - connect_bd_intf_net -intf_net 
processing_system7_1_axi_periph_m00_axi \ - [get_bd_intf_pins axi_interconnect_1/M00_AXI] \ - [get_bd_intf_pins axi_timer_1/S_AXI] - connect_bd_intf_net -intf_net processing_system7_1_ddr \ - [get_bd_intf_ports DDR] \ - [get_bd_intf_pins processing_system7_1/DDR] - connect_bd_intf_net -intf_net processing_system7_1_fixed_io \ - [get_bd_intf_ports FIXED_IO] \ - [get_bd_intf_pins processing_system7_1/FIXED_IO] - connect_bd_intf_net -intf_net processing_system7_1_m_axi_gp0 \ - [get_bd_intf_pins axi_interconnect_1/S00_AXI] \ - [get_bd_intf_pins processing_system7_1/M_AXI_GP0] - connect_bd_intf_net -intf_net s2g_queue_M_AXIS \ - [get_bd_intf_pins compute_0/s2g_dep_queue_V] \ - [get_bd_intf_pins s2g_queue/M_AXIS] - connect_bd_intf_net -intf_net store_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S04_AXI] \ - [get_bd_intf_pins store_0/m_axi_data_port] - connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V \ - [get_bd_intf_pins s2g_queue/S_AXIS] \ - [get_bd_intf_pins store_0/s2g_dep_queue_V] - connect_bd_intf_net -intf_net store_queue_M_AXIS \ - [get_bd_intf_pins store_0/store_queue_V_V] \ - [get_bd_intf_pins store_queue/M_AXIS] - - # Create port connections - connect_bd_net -net axi_timer_1_interrupt \ - [get_bd_pins axi_timer_1/interrupt] \ - [get_bd_pins xlconcat_1/In0] - connect_bd_net -net compute_0_interrupt \ - [get_bd_pins compute_0/interrupt] \ - [get_bd_pins xlconcat_1/In3] - connect_bd_net -net fetch_0_interrupt \ - [get_bd_pins fetch_0/interrupt] \ - [get_bd_pins xlconcat_1/In1] - connect_bd_net -net load_0_interrupt \ - [get_bd_pins load_0/interrupt] \ - [get_bd_pins xlconcat_1/In2] - connect_bd_net -net proc_sys_reset_interconnect_aresetn \ - [get_bd_pins axi_interconnect_1/ARESETN] \ - [get_bd_pins proc_sys_reset/interconnect_aresetn] - connect_bd_net -net proc_sys_reset_peripheral_aresetn \ - [get_bd_pins axi_interconnect_1/M00_ARESETN] \ - [get_bd_pins axi_interconnect_1/M01_ARESETN] \ - [get_bd_pins axi_interconnect_1/M02_ARESETN] \ - [get_bd_pins axi_interconnect_1/M03_ARESETN] \ - [get_bd_pins axi_interconnect_1/M04_ARESETN] \ - [get_bd_pins axi_interconnect_1/S00_ARESETN] \ - [get_bd_pins axi_smc/aresetn] \ - [get_bd_pins axi_timer_1/s_axi_aresetn] \ - [get_bd_pins compute_0/ap_rst_n] \ - [get_bd_pins fetch_0/ap_rst_n] \ - [get_bd_pins g2l_queue/s_aresetn] \ - [get_bd_pins g2s_queue/s_aresetn] \ - [get_bd_pins gemm_queue/s_aresetn] \ - [get_bd_pins l2g_queue/s_aresetn] \ - [get_bd_pins load_0/ap_rst_n] \ - [get_bd_pins load_queue/s_aresetn] \ - [get_bd_pins proc_sys_reset/peripheral_aresetn] \ - [get_bd_pins s2g_queue/s_aresetn] \ - [get_bd_pins store_0/ap_rst_n] \ - [get_bd_pins store_queue/s_aresetn] - connect_bd_net -net processing_system7_1_FCLK_CLK \ - [get_bd_pins axi_interconnect_1/ACLK] \ - [get_bd_pins axi_interconnect_1/M00_ACLK] \ - [get_bd_pins axi_interconnect_1/M01_ACLK] \ - [get_bd_pins axi_interconnect_1/M02_ACLK] \ - [get_bd_pins axi_interconnect_1/M03_ACLK] \ - [get_bd_pins axi_interconnect_1/M04_ACLK] \ - [get_bd_pins axi_interconnect_1/S00_ACLK] \ - [get_bd_pins axi_smc/aclk] \ - [get_bd_pins axi_timer_1/s_axi_aclk] \ - [get_bd_pins compute_0/ap_clk] \ - [get_bd_pins fetch_0/ap_clk] \ - [get_bd_pins g2l_queue/s_aclk] \ - [get_bd_pins g2s_queue/s_aclk] \ - [get_bd_pins gemm_queue/s_aclk] \ - [get_bd_pins l2g_queue/s_aclk] \ - [get_bd_pins load_0/ap_clk] \ - [get_bd_pins load_queue/s_aclk] \ - [get_bd_pins proc_sys_reset/slowest_sync_clk] \ - [get_bd_pins processing_system7_1/FCLK_CLK${clk}] \ - [get_bd_pins processing_system7_1/M_AXI_GP0_ACLK] \ - 
[get_bd_pins processing_system7_1/S_AXI_ACP_ACLK] \ - [get_bd_pins s2g_queue/s_aclk] \ - [get_bd_pins store_0/ap_clk] \ - [get_bd_pins store_queue/s_aclk] - connect_bd_net -net processing_system7_1_fclk_reset0_n \ - [get_bd_pins proc_sys_reset/ext_reset_in] \ - [get_bd_pins processing_system7_1/FCLK_RESET0_N] - connect_bd_net -net store_0_interrupt \ - [get_bd_pins store_0/interrupt] \ - [get_bd_pins xlconcat_1/In4] - connect_bd_net -net xlconcat_1_dout \ - [get_bd_pins processing_system7_1/IRQ_F2P] \ - [get_bd_pins xlconcat_1/dout] - - # Create address segments - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ 
- [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x00010000 -offset 0x42800000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs axi_timer_1/S_AXI/Reg] SEG_axi_timer_1_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C10000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C00000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C20000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C30000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - - - # Restore current instance - current_bd_instance $oldCurInst - - save_bd_design + connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \ + [get_bd_intf_pins $out_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \ + [get_bd_intf_pins $out_mem/BRAM_PORTB] \ + $portb +} + +# Create instance: processing_system, and set properties +if { $device_family eq "zynq-7000" } { + set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system ] + set_property -dict [ list \ + CONFIG.PCW_EN_CLK0_PORT {1} \ + CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \ + CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \ + CONFIG.PCW_USE_S_AXI_ACP {1} \ + CONFIG.preset {ZC702} \ + ] $processing_system + # Get ports that are specific to the Zynq 7000 processing system + set ps_clk [get_bd_pins processing_system/FCLK_CLK0] + set ps_rstn [get_bd_pins processing_system/FCLK_RESET0_N] + set maxi_clk [get_bd_pins processing_system/M_AXI_GP0_ACLK] + set saxi_clk [get_bd_pins processing_system/S_AXI_ACP_ACLK] + set maxi [get_bd_intf_pins processing_system/M_AXI_GP0] + set saxi [get_bd_intf_pins processing_system/S_AXI_ACP] +} elseif { $device_family eq "zynq-ultrascale+" } { + 
set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.2 processing_system ] + set_property -dict [ list \ + CONFIG.PSU__FPGA_PL0_ENABLE {1} \ + CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {100} \ + CONFIG.PSU__USE__M_AXI_GP0 {1} \ + CONFIG.PSU__USE__M_AXI_GP2 {0} \ + CONFIG.PSU__USE__S_AXI_GP0 {1} + ] $processing_system + # Get ports that are specific to the Zynq Ultrascale MPSoC processing system + set ps_clk [get_bd_pins processing_system/pl_clk0] + set ps_rstn [get_bd_pins processing_system/pl_resetn0] + set maxi_clk [get_bd_pins processing_system/maxihpm0_fpd_aclk] + set saxi_clk [get_bd_pins processing_system/saxihpc0_fpd_aclk] + set maxi [get_bd_intf_pins processing_system/M_AXI_HPM0_FPD] + set saxi [get_bd_intf_pins processing_system/S_AXI_HPC0_FPD] } -# End of create_root_design() + +# Create interface connections +connect_bd_intf_net -intf_net axi_xbar_M00_AXI [get_bd_intf_pins axi_xbar/M00_AXI] [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M01_AXI [get_bd_intf_pins axi_xbar/M01_AXI] [get_bd_intf_pins load_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M02_AXI [get_bd_intf_pins axi_xbar/M02_AXI] [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M03_AXI [get_bd_intf_pins axi_xbar/M03_AXI] [get_bd_intf_pins store_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V [get_bd_intf_pins l2g_queue/S_AXIS] [get_bd_intf_pins load_0/l2g_dep_queue_V] +connect_bd_intf_net -intf_net fetch_0_load_queue_V_V [get_bd_intf_pins fetch_0/load_queue_V_V] [get_bd_intf_pins load_queue/S_AXIS] +connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V [get_bd_intf_pins fetch_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/S_AXIS] +connect_bd_intf_net -intf_net fetch_0_store_queue_V_V [get_bd_intf_pins fetch_0/store_queue_V_V] [get_bd_intf_pins store_queue/S_AXIS] +connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V [get_bd_intf_pins compute_0/g2l_dep_queue_V] [get_bd_intf_pins g2l_queue/S_AXIS] +connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V [get_bd_intf_pins compute_0/g2s_dep_queue_V] [get_bd_intf_pins g2s_queue/S_AXIS] +connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V [get_bd_intf_pins s2g_queue/S_AXIS] [get_bd_intf_pins store_0/s2g_dep_queue_V] +connect_bd_intf_net -intf_net load_queue_M_AXIS [get_bd_intf_pins load_0/load_queue_V_V] [get_bd_intf_pins load_queue/M_AXIS] +connect_bd_intf_net -intf_net gemm_queue_M_AXIS [get_bd_intf_pins compute_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/M_AXIS] +connect_bd_intf_net -intf_net store_queue_M_AXIS [get_bd_intf_pins store_0/store_queue_V_V] [get_bd_intf_pins store_queue/M_AXIS] +connect_bd_intf_net -intf_net l2g_queue_M_AXIS [get_bd_intf_pins compute_0/l2g_dep_queue_V] [get_bd_intf_pins l2g_queue/M_AXIS] +connect_bd_intf_net -intf_net g2l_queue_M_AXIS [get_bd_intf_pins g2l_queue/M_AXIS] [get_bd_intf_pins load_0/g2l_dep_queue_V] +connect_bd_intf_net -intf_net g2s_queue_M_AXIS [get_bd_intf_pins g2s_queue/M_AXIS] [get_bd_intf_pins store_0/g2s_dep_queue_V] +connect_bd_intf_net -intf_net s2g_queue_M_AXIS [get_bd_intf_pins compute_0/s2g_dep_queue_V] [get_bd_intf_pins s2g_queue/M_AXIS] +connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port [get_bd_intf_pins axi_smc0/S00_AXI] [get_bd_intf_pins fetch_0/m_axi_ins_port] +connect_bd_intf_net -intf_net load_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S01_AXI] [get_bd_intf_pins load_0/m_axi_data_port] +connect_bd_intf_net -intf_net compute_0_m_axi_uop_port 
[get_bd_intf_pins axi_smc0/S02_AXI] [get_bd_intf_pins compute_0/m_axi_uop_port] +connect_bd_intf_net -intf_net compute_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S03_AXI] [get_bd_intf_pins compute_0/m_axi_data_port] +connect_bd_intf_net -intf_net store_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S04_AXI] [get_bd_intf_pins store_0/m_axi_data_port] +connect_bd_intf_net -intf_net axi_smc0_M00_AXI [get_bd_intf_pins axi_smc0/M00_AXI] $saxi +connect_bd_intf_net -intf_net processing_system_m_axi [get_bd_intf_pins axi_xbar/S00_AXI] $maxi + +# Create port connections +connect_bd_net -net processing_system_reset \ + [get_bd_pins pll_clk/resetn] \ + [get_bd_pins proc_sys_reset/ext_reset_in] \ + $ps_rstn +connect_bd_net -net ps_clk_net \ + [get_bd_pins pll_clk/clk_in1] \ + $ps_clk +connect_bd_net -net proc_sys_reset_interconnect_aresetn \ + [get_bd_pins axi_xbar/ARESETN] \ + [get_bd_pins proc_sys_reset/interconnect_aresetn] +connect_bd_net -net proc_sys_reset_peripheral_aresetn \ + [get_bd_pins proc_sys_reset/peripheral_aresetn] \ + [get_bd_pins axi_smc0/aresetn] \ + [get_bd_pins axi_xbar/M00_ARESETN] \ + [get_bd_pins axi_xbar/M01_ARESETN] \ + [get_bd_pins axi_xbar/M02_ARESETN] \ + [get_bd_pins axi_xbar/M03_ARESETN] \ + [get_bd_pins axi_xbar/S00_ARESETN] \ + [get_bd_pins fetch_0/ap_rst_n] \ + [get_bd_pins load_0/ap_rst_n] \ + [get_bd_pins store_0/ap_rst_n] \ + [get_bd_pins compute_0/ap_rst_n] \ + [get_bd_pins load_queue/s_aresetn] \ + [get_bd_pins gemm_queue/s_aresetn] \ + [get_bd_pins store_queue/s_aresetn] \ + [get_bd_pins l2g_queue/s_aresetn] \ + [get_bd_pins g2l_queue/s_aresetn] \ + [get_bd_pins g2s_queue/s_aresetn] \ + [get_bd_pins s2g_queue/s_aresetn] +connect_bd_net -net processing_system_clk \ + [get_bd_pins pll_clk/clk_out1] \ + [get_bd_pins proc_sys_reset/slowest_sync_clk] \ + [get_bd_pins axi_smc0/aclk] \ + [get_bd_pins axi_xbar/ACLK] \ + [get_bd_pins axi_xbar/M00_ACLK] \ + [get_bd_pins axi_xbar/M01_ACLK] \ + [get_bd_pins axi_xbar/M02_ACLK] \ + [get_bd_pins axi_xbar/M03_ACLK] \ + [get_bd_pins axi_xbar/S00_ACLK] \ + [get_bd_pins fetch_0/ap_clk] \ + [get_bd_pins load_0/ap_clk] \ + [get_bd_pins compute_0/ap_clk] \ + [get_bd_pins store_0/ap_clk] \ + [get_bd_pins load_queue/s_aclk] \ + [get_bd_pins gemm_queue/s_aclk] \ + [get_bd_pins store_queue/s_aclk] \ + [get_bd_pins l2g_queue/s_aclk] \ + [get_bd_pins g2l_queue/s_aclk] \ + [get_bd_pins g2s_queue/s_aclk] \ + [get_bd_pins s2g_queue/s_aclk] \ + $maxi_clk \ + $saxi_clk + +# Create address segments +create_bd_addr_seg -range $ip_reg_map_range -offset $fetch_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $load_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $compute_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $store_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg +if { $device_family eq "zynq-7000" } { + create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM + create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces 
compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+} elseif { $device_family eq "zynq-ultrascale+"} {
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+}
+
+save_bd_design

 ##################################################################
-# MAIN FLOW
+# COMPILATION FLOW
 ##################################################################

-create_root_design "" $clock_id $inp_part $wgt_part $out_part $inp_bus_width \
-  $inp_mem_depth $wgt_bus_width $wgt_mem_depth $out_bus_width $out_mem_depth
-
 # Create top-level wrapper file
 make_wrapper -files \
   [get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
@@ -937,8 +421,7 @@ update_compile_order -fileset sources_1
 update_compile_order -fileset sim_1

 # Run bitstream generation on 8 threads with performance-oriented P&R strategy
-# create_run impl_1 -parent_run synth_1 -flow {Vivado Implementation 2017} \
-#   -strategy "Performance_ExplorePostRoutePhysOpt"
+set num_threads 8
 launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
 wait_on_run impl_1
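The address segments above are the runtime contract of the unified address map: each VTA module gets one control window of size $ip_reg_map_range at its $*_base_addr, identical between the hardware generator and the driver. The following is a minimal host-side sketch, not the VTA driver itself, of how such a window is typically consumed on a Zynq-class device by mapping /dev/mem. The 0x43C00000 base and 0x10000 range are the Zynq-7000 fetch-module defaults visible in the old address segments, and kInsnCountOffset is a hypothetical stand-in for the real VTA_FETCH_INSN_COUNT_OFFSET macro in hw_spec.h.

  // Hypothetical sketch: map the fetch module's register window and poke one
  // register. Base, range, and offset values here are illustrative only.
  #include <cstdint>
  #include <fcntl.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main() {
    const off_t kFetchBase = 0x43C00000;   // mirrors $fetch_base_addr (assumption)
    const size_t kRange = 0x10000;         // mirrors $ip_reg_map_range (assumption)
    const size_t kInsnCountOffset = 0x10;  // stand-in for VTA_FETCH_INSN_COUNT_OFFSET
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0) return 1;
    void *base = mmap(nullptr, kRange, PROT_READ | PROT_WRITE, MAP_SHARED, fd, kFetchBase);
    if (base == MAP_FAILED) { close(fd); return 1; }
    volatile uint32_t *regs = static_cast<volatile uint32_t*>(base);
    regs[kInsnCountOffset / 4] = 0;        // offsets are in bytes, registers 32-bit wide
    munmap(base, kRange);
    close(fd);
    return 0;
  }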
diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc
index d21d121a8ddb..e3df31a9ddfe 100644
--- a/vta/hardware/xilinx/sim/vta_test.cc
+++ b/vta/hardware/xilinx/sim/vta_test.cc
@@ -35,17 +35,6 @@ int main(void) {
   printParameters();
 #endif

-  // Micro op bound
-  assert(VTA_UOP_GEM_2_1 < VTA_UOP_WIDTH);
-  assert(VTA_UOP_ALU_1_1 < VTA_UOP_WIDTH);
-  // Make sure there is no misaligment
-  assert(VTA_INSN_GEM_9_1 < VTA_INSN_GEM_A_0);
-  assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
-  // Instruction bounds
-  assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
-  assert(VTA_INSN_GEM_F_1 < VTA_INS_WIDTH);
-  assert(VTA_INSN_ALU_G_1 < VTA_INS_WIDTH);
-
   int status = 0;

   // Run ALU test (vector-scalar operators)
@@ -65,15 +54,15 @@ int main(void) {
   status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
   status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
   status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);

   // Run blocked GEMM test
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
   status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
   status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);

   // Simple GEMM unit test
-  status |= gemm_test(64, 64, 64, true);
+  status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, false);

   return status;
 }

diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index cb6b7b76cd11..fba9b4febcf8 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -18,7 +18,6 @@
  */

 /*!
- * Copyright (c) 2018 by Contributors
  * \file vta.cpp
  * \brief VTA HLS design.
  */
@@ -29,13 +28,114 @@
 #include "vta.h"

+template <typename DATA_T, int MAT_AXI_RATIO>
+void reset_mem(
+  memop_sram_T &sram_idx,
+  memop_sram_T range,
+  DATA_T mem[][MAT_AXI_RATIO]) {
+
+  for (int i = 0; i < range; i++) {
+    for (int j = 0; j < MAT_AXI_RATIO; j++) {
+#pragma HLS UNROLL
+      mem[sram_idx][j] = 0;
+    }
+    sram_idx++;
+  }
+}
+
+template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
+void load_pad_2d(
+  volatile DATA_T *src,
+  DATA_T dst[][MAT_AXI_RATIO],
+  memop_sram_T sram_idx,
+  memop_dram_T dram_idx,
+  memop_size_T y_size,
+  memop_size_T x_size,
+  memop_stride_T x_stride,
+  memop_pad_T x_pad_0,
+  memop_pad_T x_pad_1,
+  memop_sram_T y_offset_0,
+  memop_sram_T y_offset_1) {
+#pragma HLS INLINE
+
+  reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_0, dst);
+  for (int y = 0; y < y_size; y++) {
+#pragma HLS PIPELINE
+    reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_0, dst);
+    memcpy(&dst[sram_idx][0],
+           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
+           x_size * ELEM_BYTES);
+    sram_idx += x_size;
+    dram_idx += x_stride;
+    reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_1, dst);
+  }
+  reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_1, dst);
+}
+
+template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
+void load_2d(
+  volatile DATA_T *src,
+  DATA_T dst[][MAT_AXI_RATIO],
+  memop_sram_T sram_idx,
+  memop_dram_T dram_idx,
+  memop_size_T y_size,
+  memop_size_T x_size,
+  memop_stride_T x_stride) {
+#pragma HLS INLINE
+
+  for (int y = 0; y < y_size; y++) {
+    memcpy(&dst[sram_idx][0],
+           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
+           x_size * ELEM_BYTES);
+#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
+    sram_idx += x_size;
+    dram_idx += x_stride;
+  }
+}
+
+template <typename WIDE_T, typename NARROW_T, typename IDX_T,
+          int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
+void read_tensor(
+  IDX_T idx,
+  WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W],
+  NARROW_T dst[Y_DIM][X_DIM]) {
+#pragma HLS INLINE
+
+  // Read in weight tensor
+  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
+    WIDE_T packet = src[idx][p];
+    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
+      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
+      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
+      dst[x][y] = (NARROW_T) packet.range((w + 1) * NARROW_W - 1, w * NARROW_W);
+    }
+  }
+}
+
+template <typename WIDE_T, typename NARROW_T, typename IDX_T,
+          int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
+void write_tensor(
+  IDX_T idx,
+  NARROW_T src[Y_DIM][X_DIM],
+  WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
+#pragma HLS INLINE
+
+  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
+    WIDE_T packet = 0;
+    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
+      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
+      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
+      packet.range((w + 1) * NARROW_W - 1, w * NARROW_W) = src[x][y];
+    }
+    dst[idx][p] = packet;
+  }
+}
+
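read_tensor and write_tensor above are the one place where the AXI bus width meets the tensor tile shape, so the index arithmetic is worth spelling out. The stand-alone sketch below (plain C++, no ap_int) prints the lane-to-tile mapping for an assumed 64-bit bus carrying 8-bit elements over a 4x4 tile; all four constants are illustrative, and the x/y names deliberately mirror the row/column indices as they are (somewhat confusingly) named in read_tensor.

  // Worked example of the packet-to-tile index mapping used by
  // read_tensor / write_tensor. WIDE_W/NARROW_W/Y_DIM/X_DIM are assumptions.
  #include <cstdio>

  int main() {
    const int WIDE_W = 64, NARROW_W = 8, Y_DIM = 4, X_DIM = 4;
    const int packets = NARROW_W * Y_DIM * X_DIM / WIDE_W;  // 2 bus words per tile
    const int lanes = WIDE_W / NARROW_W;                    // 8 elements per bus word
    for (int p = 0; p < packets; p++) {
      for (int w = 0; w < lanes; w++) {
        int x = (p * lanes + w) / X_DIM;  // tile row ("x" in read_tensor)
        int y = (p * lanes + w) % X_DIM;  // tile column ("y" in read_tensor)
        printf("packet %d, lane %d -> dst[%d][%d]\n", p, w, x, y);
      }
    }
    return 0;
  }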
   volatile insn_T *insns,
   hls::stream<insn_T> &load_queue,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<insn_T> &store_queue) {
-#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET)
 #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = gemm_queue
@@ -43,170 +143,288 @@ void fetch(
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS

   INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
-#pragma HLS PIPELINE II = 1
+#pragma HLS PIPELINE
     // Read instruction fields
-    insn_T insn = insns[pc];
+    insn_T raw_insn = insns[pc];
+    VTAInsn insn;
+    insn.generic = *((VTAGenericInsn *) &raw_insn);
     // Do some partial decoding
-    opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
-    memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+    opcode_T opcode = insn.generic.opcode;
+    memop_id_T memory_type = insn.mem.memory_type;
     // Push to appropriate instruction queue
     if (opcode == VTA_OPCODE_STORE) {
-      store_queue.write(insn);
-    } else if (opcode == VTA_OPCODE_LOAD &&
-        (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
-      load_queue.write(insn);
+      store_queue.write(raw_insn);
+    } else if (opcode == VTA_OPCODE_LOAD) {
+      if (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT) {
+        load_queue.write(raw_insn);
+      } else {
+        gemm_queue.write(raw_insn);
+      }
     } else {
-      gemm_queue.write(insn);
+      gemm_queue.write(raw_insn);
     }
   }
 }

 void load(
-  volatile inp_vec_T *inputs,
-  volatile wgt_vec_T *weights,
+  volatile bus_T *inputs,
+  volatile bus_T *weights,
   hls::stream<insn_T> &load_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &l2g_dep_queue,
-  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
-  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]
-  ) {
-#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) {
 #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = g2l_dep_queue
 #pragma HLS INTERFACE axis port = l2g_dep_queue
 #pragma HLS INTERFACE bram port = wgt_mem
 #pragma HLS INTERFACE bram port = inp_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
+#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
+#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P

   // Pop load instruction
-  insn_T insn = load_queue.read();
-
-  // Decode instruction
-  bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
-  bool pop_next_dependence = insn[VTA_INSN_MEM_2];
-  bool push_prev_dependence = insn[VTA_INSN_MEM_3];
-  bool push_next_dependence = insn[VTA_INSN_MEM_4];
-  memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
-  memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
-  memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
-  memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
-  memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
-  memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
-  memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
-  memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
- 
memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); + insn_T raw_insn = load_queue.read(); + // Cast to MemInsn + insn_T raw_copy = raw_insn; + VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_next_dependence) { + if (insn.pop_next_dep) { g2l_dep_queue.read(); } - // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Pre-compute dimensions, and offsets - memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; - memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; - memop_sram_T y_offset = x_size_total * y_pad_0; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT - - // Skip padding along y dimension - sram_idx += y_offset; + // Pre-processing + memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1); + memop_sram_T y_offset_0 = x_width * insn.y_pad_0; +#pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4 + memop_sram_T y_offset_1 = x_width * insn.y_pad_1; +#pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4 + + if (insn.memory_type == VTA_MEM_ID_INP) { + load_pad_2d( + inputs, + inp_mem, + insn.sram_base, + insn.dram_base, + insn.y_size, + insn.x_size, + insn.x_stride, + insn.x_pad_0, + insn.x_pad_1, + y_offset_0, + y_offset_1); + } else if (insn.memory_type == VTA_MEM_ID_WGT) { + load_2d( + weights, + wgt_mem, + insn.sram_base, + insn.dram_base, + insn.y_size, + insn.x_size, + insn.x_stride); + } - // Perform data transfer from DRAM - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; - // Perform data transfer - if (memory_type == VTA_MEM_ID_INP) { - memcpy(&inp_mem[sram_idx][0], - (const inp_vec_T*) &inputs[dram_idx * VTA_BATCH], - x_size * VTA_INP_ELEM_BYTES); - } else { - memcpy(&wgt_mem[sram_idx][0], - (const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT], - x_size * VTA_WGT_ELEM_BYTES); - } - sram_idx += x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; + // Push dependence token if instructed + if (insn.push_next_dep) { + l2g_dep_queue.write(1); } +} - // Reset SRAM index - sram_idx = sram_base; - // Pad x/y edges with zeros - for (int y = 0; y < y_size_total; y++) { - if (y < y_pad_0 || y >= y_pad_0 + y_size) { - for (int x = 0; x < x_size_total; x++) { -#pragma HLS PIPELINE II = 1 rewind - if (memory_type == VTA_MEM_ID_INP) { - for (int i = 0; i < VTA_BATCH; i++) { - inp_mem[sram_idx][i] = 0; - } - } else { - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - wgt_mem[sram_idx][i] = 0; - } - } - sram_idx++; - } - } else { - for (int x = 0; x < x_pad_0; x++) { -#pragma HLS PIPELINE II = 1 rewind - if (memory_type == VTA_MEM_ID_INP) { - for (int i = 0; i < VTA_BATCH; i++) { - inp_mem[sram_idx][i] = 0; - } - } else { - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - wgt_mem[sram_idx][i] = 0; +void gemm( + insn_T insn_raw, + uop_T uop_mem[VTA_UOP_BUFF_DEPTH], + bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO], + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { +#pragma HLS INLINE + + VTAGemInsn insn = *((VTAGemInsn *) &insn_raw); + + // Loop offset + acc_idx_T dst_offset_out = 0; + inp_idx_T src_offset_out = 0; + wgt_idx_T wgt_offset_out = 0; + + // 
Outer Loop
+  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
+    acc_idx_T dst_offset_in = dst_offset_out;
+    inp_idx_T src_offset_in = src_offset_out;
+    wgt_idx_T wgt_offset_in = wgt_offset_out;
+
+    // Inner Loop
+    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
+
+      // Iterate over micro op
+      READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
+#pragma HLS PIPELINE II = 1
+        // Read micro-op fields
+        uop_T uop = uop_mem[upc];
+
+        // Decode indices
+        acc_idx_T dst_idx =
+            uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in;
+        inp_idx_T src_idx =
+            uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in;
+        wgt_idx_T wgt_idx =
+            uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in;
+
+        // Read in weight tensor
+        wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN];
+        read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor);
+        // Read in input tensor
+        inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN];
+        read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor);
+        // Read in accum tensor
+        acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor);
+        // Output tensor
+        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+
+        // Inner GEMM loop
+        for (int b = 0; b < VTA_BATCH; b++) {
+          for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
+            // Initialize the accumulator values
+            acc_T accum = a_tensor[b][oc];
+            // Dot product sum
+            sum_T tmp = 0;
+            // Inner matrix multiplication loop (input channel/feature)
+            for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
+              wgt_T w_elem = w_tensor[oc][ic];
+              inp_T i_elem = i_tensor[b][ic];
+              mul_T prod_dsp = i_elem * w_elem;
+              tmp += (sum_T) prod_dsp;
+            }
+            // Update summation
+            accum += (acc_T) tmp;
+            // Write back result acc_mem
+            a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum;
+            // And output vector
+            o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
+          }
+        }
+
+        // Write the results back into accumulator
+        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem);
+        // Write the results back in the output buffer
+        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
+      }
+      // Update offsets
+      dst_offset_in += insn.dst_factor_in;
+      src_offset_in += insn.src_factor_in;
+      wgt_offset_in += insn.wgt_factor_in;
+    }
+    // Update offsets
+    dst_offset_out += insn.dst_factor_out;
+    src_offset_out += insn.src_factor_out;
+    wgt_offset_out += insn.wgt_factor_out;
+  }
+}
+
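For reference, the arithmetic inside one GEMM micro-op above reduces to the small host-side model below, with plain integer types standing in for inp_T/wgt_T/acc_T/out_T and illustrative constants standing in for VTA_BATCH/VTA_BLOCK_IN/VTA_BLOCK_OUT. It accumulates an input row against a weight row, optionally resets the accumulator, and truncates into the output buffer, exactly mirroring the reset_reg and range() logic of the HLS loop.

  // Reference model of one GEMM micro-op (dimensions are assumptions).
  #include <cstdint>

  const int BATCH = 1, BLOCK_IN = 16, BLOCK_OUT = 16;

  void gemm_uop_model(const int8_t inp[BATCH][BLOCK_IN],
                      const int8_t wgt[BLOCK_OUT][BLOCK_IN],
                      int32_t acc[BATCH][BLOCK_OUT],
                      int8_t out[BATCH][BLOCK_OUT],
                      bool reset_reg) {
    for (int b = 0; b < BATCH; b++) {
      for (int oc = 0; oc < BLOCK_OUT; oc++) {
        int32_t accum = acc[b][oc];
        int32_t tmp = 0;                     // dot-product sum
        for (int ic = 0; ic < BLOCK_IN; ic++) {
          tmp += (int32_t) inp[b][ic] * (int32_t) wgt[oc][ic];  // DSP multiply
        }
        accum += tmp;
        acc[b][oc] = reset_reg ? 0 : accum;  // write back (or reset) acc_mem
        out[b][oc] = (int8_t) accum;         // truncate to VTA_OUT_WIDTH bits
      }
    }
  }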
+void alu(
+  insn_T insn_raw,
+  uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
+  bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
+  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
+#pragma HLS INLINE
+
+  VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);
+
+  // Loop offset
+  acc_idx_T dst_offset_out = 0;
+  inp_idx_T src_offset_out = 0;
+
+  // Outer Loop
+  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
+    acc_idx_T dst_offset_in = dst_offset_out;
+    inp_idx_T src_offset_in = src_offset_out;
+
+    // Inner Loop
+    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
+      // Iterate over micro op
+      READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
+#pragma HLS PIPELINE II = 2
+        // Read micro-op fields
+        uop_T uop = uop_mem[upc];
+
+        // Decode
+        acc_idx_T dst_idx =
+            uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in;
+        acc_idx_T src_idx =
+            uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
+
+        // Read in src tensor
+        acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
+        // Read in dst tensor
+        acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
+        // Output tensor
+        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+
+        // Perform ALU op over matrix elements
+        for (int i = 0; i < VTA_BATCH; i++) {
+          for (int b = 0; b < VTA_BLOCK_OUT; b++) {
+            // Read in operands
+            acc_T src_0 = dst_tensor[i][b];
+            acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
+            aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
+            aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
+            if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
+              // Compute Min/Max
+              acc_T mix_val = src_0 < src_1 ?
+                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
+                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
+              dst_tensor[i][b] = mix_val;
+              o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
+            } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
+              // Compute Sum
+              acc_T add_val =
+                  src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
+              dst_tensor[i][b] = add_val;
+              o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
+            } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
+              // Compute Shift Right
+              acc_T shr_val = src_0 >> shft_by;
+              dst_tensor[i][b] = shr_val;
+              o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
+            }
+          }
+        }
+
+        // Write the results back into accumulator
+        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
+        // Write the results back in the output buffer
+        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
+      }
+      // Update offsets
+      dst_offset_in += insn.dst_factor_in;
+      src_offset_in += insn.src_factor_in;
+    }
+    // Update offsets
+    dst_offset_out += insn.dst_factor_out;
+    src_offset_out += insn.src_factor_out;
+  }
+}
+
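The branch structure of the tensor ALU above collapses to one small per-element function. The model below uses plain int32_t for acc_T and made-up opcode constants; the real encodings live in vta/include/vta/hw_spec.h. Note that dst is always the first operand (src_0) and src is either the second tensor or the immediate, with SHR taking its shift amount from the low shft_by bits of src_1.

  // Per-element reference model of the tensor ALU: MIN/MAX/ADD/SHR.
  // The AluOp values are illustrative, not the hw_spec.h encodings.
  #include <algorithm>
  #include <cstdint>

  enum AluOp { kMin, kMax, kAdd, kShr };

  int32_t alu_elem_model(AluOp op, int32_t dst, int32_t src) {
    switch (op) {
      case kMin: return std::min(dst, src);
      case kMax: return std::max(dst, src);
      case kAdd: return dst + src;
      case kShr: return dst >> src;  // src is the shift amount (shft_by slice)
      default:   return dst;
    }
  }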
RAM_1P // Micro-op storage static uop_T uop_mem[VTA_UOP_BUFF_DEPTH]; // Accumulator storage - static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; -#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2 + static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO]; +#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2 +// This is necessary to obtain II=1 +#pragma HLS DEPENDENCE variable = acc_mem inter false // Pop GEMM instruction - insn_T insn = gemm_queue.read(); - - // Decode - opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0); - bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; - bool pop_next_dependence = insn[VTA_INSN_MEM_2]; - bool push_prev_dependence = insn[VTA_INSN_MEM_3]; - bool push_next_dependence = insn[VTA_INSN_MEM_4]; + insn_T raw_insn = gemm_queue.read(); + // Cast to GenericInsn + VTAInsn insn; + insn_T raw_copy = raw_insn; + insn.generic = *((VTAGenericInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_prev_dependence) { + if (insn.generic.pop_prev_dep) { l2g_dep_queue.read(); } - if (pop_next_dependence) { + if (insn.generic.pop_next_dep) { s2g_dep_queue.read(); } + // Set done value + done = 0; // Perform action based on opcode - if (opcode == VTA_OPCODE_FINISH) { + if (insn.generic.opcode == VTA_OPCODE_FINISH) { // Set done flag if we reach a FINISH instruction done = 1; - } else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) { - // Set done value - done = 0; - - // Decode instruction - memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); - memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); - memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); - + } else if (insn.generic.opcode == VTA_OPCODE_LOAD) { // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Pre-compute dimensions, and offsets - memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; - memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; - memop_sram_T y_offset = x_size_total * y_pad_0; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT - - if (memory_type == VTA_MEM_ID_UOP) { + memop_sram_T sram_idx = insn.mem.sram_base; + memop_dram_T dram_idx = insn.mem.dram_base; + if (insn.mem.memory_type == VTA_MEM_ID_UOP) { // Perform data transfer - memcpy(&uop_mem[sram_base], - (const uop_T*) &uops[dram_base], - x_size * sizeof(uop_T)); - } else { - // Skip vertical padding - sram_idx += y_offset; + memcpy(&uop_mem[sram_idx], + (const uop_T*) &uops[dram_idx], + insn.mem.x_size * sizeof(uop_T)); + } else if (insn.mem.memory_type == VTA_MEM_ID_ACC) { // Perform data transfer from DRAM - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; - // Perform data transfer - memcpy(&acc_mem[sram_idx][0], - (const acc_vec_T*) &biases[dram_idx * VTA_BATCH], - x_size*VTA_ACC_ELEM_BYTES); - sram_idx += 
x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; - } - } - } else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) { - // Set done value - done = 0; - - // Decode - bool reset_out = insn[VTA_INSN_GEM_5]; - uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0); - uop_idx_T uop_end = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0); - loop_T iter_out = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0); - loop_T iter_in = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0); - acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0); - acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0); - inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0); - inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0); - - // GEMM-specific fields - wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0); - wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_F_1, VTA_INSN_GEM_F_0); - - // ALU-specific field - aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_E_1, VTA_INSN_ALU_E_0); - bool use_imm = insn[VTA_INSN_ALU_F]; - aluop_imm_T imm = insn.range(VTA_INSN_ALU_G_1, VTA_INSN_ALU_G_0); - acc_idx_T dst_offset_out = 0; - inp_idx_T src_offset_out = 0; - wgt_idx_T wgt_offset_out = 0; - - // Outer Loop - EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) { -#pragma HLS DEPENDENCE variable = acc_mem inter false - acc_idx_T dst_offset_in = dst_offset_out; - inp_idx_T src_offset_in = src_offset_out; - wgt_idx_T wgt_offset_in = wgt_offset_out; - - // Inner Loop - EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) { - // Perform appropriate computation based on opcode - if (opcode == VTA_OPCODE_GEMM) { - // Iterate over micro op - READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { -#pragma HLS PIPELINE II = 1 rewind - - // Read micro-op fields - uop_T uop = uop_mem[upc]; - - // Decode indices - acc_idx_T dst_idx = - uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in; - inp_idx_T src_idx = - uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in; - wgt_idx_T wgt_idx = - uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in; - - // Read weight matrix - wgt_vec_T w_matrix[VTA_BLOCK_OUT]; - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - w_matrix[i] = wgt_mem[wgt_idx][i]; - } - // Read input matrix and accum matrix - acc_vec_T o_matrix[VTA_BATCH]; - inp_vec_T i_matrix[VTA_BATCH]; - for (int i = 0; i < VTA_BATCH; i++) { - o_matrix[i] = acc_mem[dst_idx][i]; - i_matrix[i] = inp_mem[src_idx][i]; - } - // Result matrices - acc_vec_T acc_mem_val[VTA_BATCH]; - out_vec_T st_buf_val[VTA_BATCH]; - - // Inner GEMM loop - for (int i = 0; i < VTA_BATCH; i++) { - for (int b = 0; b < VTA_BLOCK_OUT; b++) { - // Initialize the accumulator values - acc_T accum = - o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - // Dot product sum - sum_T tmp = 0; - // Inner matrix multiplication loop (input channel/feature) - for (int k = 0; k < VTA_BLOCK_IN; k++) { - wgt_T w_elem = - w_matrix[b].range((k + 1) * VTA_WGT_WIDTH - 1, k * VTA_WGT_WIDTH); - inp_T i_elem = - i_matrix[i].range((k + 1) * VTA_INP_WIDTH - 1, k * VTA_INP_WIDTH); - mul_T prod = i_elem * w_elem; -#ifdef NO_DSP -#pragma HLS RESOURCE variable = prod core = Mul_LUT -#endif // NO_DSP - tmp += (sum_T) prod; - } - // Update summation - accum += (acc_T) tmp; - // Update result vector - acc_mem_val[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = - reset_out ? 
(acc_T) 0 : accum; - st_buf_val[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) accum.range(VTA_OUT_WIDTH - 1, 0); - } - // Write to buffers - acc_mem[dst_idx][i] = acc_mem_val[i]; - out_mem[dst_idx][i] = st_buf_val[i]; - } - } - } -#ifndef NO_ALU - else if (opcode == VTA_OPCODE_ALU) { - // Iterate over micro op - READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { - // Read micro-op fields - uop_T uop = uop_mem[upc]; - - // Decode - acc_idx_T dst_idx = - uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in; - acc_idx_T src_idx = - uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in; - - // Perform ALU op over matrix elements - for (int i = 0; i < VTA_BATCH; i++) { - // Read input matrix and accum matrix - acc_vec_T dst_vector = acc_mem[dst_idx][i]; - acc_vec_T src_vector = acc_mem[src_idx][i]; - // Result matrices - acc_vec_T cmp_res; - acc_vec_T add_res; - acc_vec_T shr_res; - out_vec_T short_cmp_res; - out_vec_T short_add_res; - out_vec_T short_shr_res; - // Results vector - acc_vec_T res_vec = 0; - for (int b = 0; b < VTA_BLOCK_OUT; b++) { -#pragma HLS PIPELINE II = 1 rewind - // Read in operands - acc_T src_0 = dst_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - acc_T src_1 = use_imm ? - (acc_T) imm : - src_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - // Compute Min/Max - acc_T mix_val = src_0 < src_1 ? - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) : - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0); - cmp_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = mix_val; - short_cmp_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Sum - acc_T add_val = - src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0); - add_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = add_val; - short_add_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Shift Right - acc_T shr_val = - src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0); - shr_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val; - short_shr_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0); - } - - // Store to accum memory/store buffer - if (alu_opcode == VTA_ALU_OPCODE_MIN || - alu_opcode == VTA_ALU_OPCODE_MAX) { - acc_mem[dst_idx][i] = cmp_res; - out_mem[dst_idx][i] = short_cmp_res; - } else if (alu_opcode == VTA_ALU_OPCODE_ADD) { - acc_mem[dst_idx][i] = add_res; - out_mem[dst_idx][i] = short_add_res; - } else if (alu_opcode == VTA_ALU_OPCODE_SHR) { - acc_mem[dst_idx][i] = shr_res; - out_mem[dst_idx][i] = short_shr_res; - } - } - } - } -#endif // NO_ALU - - // Update offsets - dst_offset_in += dst_factor_in; - src_offset_in += src_factor_in; - wgt_offset_in += wgt_factor_in; - } - - // Update offsets - dst_offset_out += dst_factor_out; - src_offset_out += src_factor_out; - wgt_offset_out += wgt_factor_out; + load_2d( + biases, + acc_mem, + sram_idx, + dram_idx, + insn.mem.y_size, + insn.mem.x_size, + insn.mem.x_stride); } + } else if (insn.generic.opcode == VTA_OPCODE_GEMM) { + gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); + } else if (insn.generic.opcode == VTA_OPCODE_ALU) { + alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); } // Push dependence token if instructed - if (push_prev_dependence) { + if (insn.generic.push_prev_dep) { g2l_dep_queue.write(1); } - if 
(push_next_dependence) { + if (insn.generic.push_next_dep) { g2s_dep_queue.write(1); } } void store( - volatile out_vec_T *outputs, + volatile bus_T *outputs, hls::stream &store_queue, hls::stream &g2s_dep_queue, hls::stream &s2g_dep_queue, - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH] - ) { + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port #pragma HLS INTERFACE axis port = store_queue #pragma HLS INTERFACE axis port = g2s_dep_queue #pragma HLS INTERFACE axis port = s2g_dep_queue #pragma HLS INTERFACE bram port = out_mem #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS +#pragma HLS RESOURCE variable = out_mem core = RAM_1P - // Load buffer - insn_T insn = store_queue.read(); - - // Decode - bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; - bool pop_next_dependence = insn[VTA_INSN_MEM_2]; - bool push_prev_dependence = insn[VTA_INSN_MEM_3]; - bool push_next_dependence = insn[VTA_INSN_MEM_4]; - memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); - memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); - memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); + // Pop store instruction + insn_T raw_insn = store_queue.read(); + // Cast to MemInsn + insn_T raw_copy = raw_insn; + VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_prev_dependence) { + if (insn.pop_prev_dep) { g2s_dep_queue.read(); } // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Skip padding along y dimension - memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0; - sram_idx += y_offset; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT + memop_sram_T sram_idx = insn.sram_base; + memop_dram_T dram_idx = insn.dram_base; // Copy along y dimension - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; + for (int y = 0; y < insn.y_size; y++) { +#pragma HLS PIPELINE // Perform data transfer memcpy( - const_cast(&outputs[dram_idx*VTA_BATCH]), - (const out_vec_T*) &out_mem[sram_idx][0], - x_size * VTA_INP_ELEM_BYTES); - sram_idx += x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; + const_cast(&outputs[dram_idx * OUT_MAT_AXI_RATIO]), + (const bus_T*) &out_mem[sram_idx][0], + insn.x_size * VTA_OUT_ELEM_BYTES); +#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT + sram_idx += insn.x_size; + dram_idx += insn.x_stride; } // Push dependence token if instructed - if (push_prev_dependence) { + if (insn.push_prev_dep) { s2g_dep_queue.write(1); } } @@ -571,10 +557,10 @@ void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, - volatile inp_vec_T *inputs, - volatile wgt_vec_T *weights, - volatile acc_vec_T *biases, - volatile out_vec_T *outputs) { + volatile 
bus_T *inputs, + volatile bus_T *weights, + volatile bus_T *biases, + volatile bus_T *outputs) { #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port @@ -606,14 +592,14 @@ void vta( hls::stream s2g_dep_queue; PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue) hls::stream g2l_dep_queue; - PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue) + PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue) hls::stream g2s_dep_queue; PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue) // Instantiate memories - inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH]; - wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]; - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO]; + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]; + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]; // Push all instructions into the queues fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue); @@ -642,9 +628,9 @@ void vta( tmp_load_popped = true; } // Check dependences and invoke the load stage - bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2]; - if ((pop_next_dependence && !g2l_dep_queue.empty()) || - !pop_next_dependence) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load); + if ((insn.pop_next_dep && !g2l_dep_queue.empty()) || + !insn.pop_next_dep) { // Push the instruction in the load queue load_queue.write(tmp_load); tmp_load_popped = false; @@ -662,16 +648,15 @@ void vta( tmp_gemm_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1]; - bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2]; + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); if ( - (pop_prev_dependence && !l2g_dep_queue.empty() && - pop_next_dependence && !s2g_dep_queue.empty()) || - (!pop_prev_dependence && pop_next_dependence && + (insn.pop_prev_dep && !l2g_dep_queue.empty() && + insn.pop_next_dep && !s2g_dep_queue.empty()) || + (!insn.pop_prev_dep && insn.pop_next_dep && !s2g_dep_queue.empty()) || - (pop_prev_dependence && !l2g_dep_queue.empty() && - !pop_next_dependence) || - (!pop_prev_dependence && !pop_next_dependence) + (insn.pop_prev_dep && !l2g_dep_queue.empty() && + !insn.pop_next_dep) || + (!insn.pop_prev_dep && !insn.pop_next_dep) ) { // Push the instruction in the load queue gemm_queue.write(tmp_gemv); @@ -692,9 +677,10 @@ void vta( tmp_store_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1]; - if ((pop_prev_dependence && !g2s_dep_queue.empty()) || - !pop_prev_dependence) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store); + + if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) || + !insn.pop_prev_dep) { // Push the instruction in the load queue store_queue.write(tmp_store); tmp_store_popped = false; @@ -716,10 +702,11 @@ void vta( } } if (tmp_gemm_popped) { - if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); + if (l2g_dep_queue.empty() && insn.pop_prev_dep) { printf("waiting on l2g\n"); } - if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) { + if (s2g_dep_queue.empty() && insn.pop_next_dep) { printf("waiting on s2g\n"); } } diff --git a/vta/hardware/xilinx/src/vta.h b/vta/hardware/xilinx/src/vta.h index 
1395d5eaba8e..d796e2265d4f 100644 --- a/vta/hardware/xilinx/src/vta.h +++ b/vta/hardware/xilinx/src/vta.h @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file vta.h * \brief Type definitions and prototype for VTA HLS design. */ @@ -32,6 +31,16 @@ #include +/*! +* Define HLS stream depth +*/ +#define PRAGMA_SUB(x) _Pragma (#x) +#define PRAGMA_HLS(x) PRAGMA_SUB(x) +#define STREAM_IN_DEPTH 8 + +/* \typedef bus_T memory bus datatype*/ +typedef ap_uint bus_T; + /* \typedef uop_T Micro-op datatype*/ typedef ap_uint uop_T; @@ -53,18 +62,6 @@ typedef ap_int mul_T; /* \typedef sum_T GEMM accumulator datatype*/ typedef ap_int sum_T; -/* \typedef inp_vec_T Input vector datatype*/ -typedef ap_uint inp_vec_T; - -/* \typedef wgt_vec_T Weight vector datatype*/ -typedef ap_uint wgt_vec_T; - -/* \typedef acc_vec_T Accumulator vector datatype*/ -typedef ap_uint acc_vec_T; - -/* \typedef out_vec_T Output vector datatype*/ -typedef ap_uint out_vec_T; - /* \typedef uop_idx_T Micro-op SRAM index datatype*/ typedef ap_uint uop_idx_T; @@ -107,18 +104,14 @@ typedef ap_uint memop_pad_T; /* \typedef aluop_opcode_T ALU operation opcode datatype*/ typedef ap_uint aluop_opcode_T; -/* \typedef aluop_opcode_T ALU operation immediate datatype*/ +/* \typedef aluop_imm_T ALU operation immediate datatype*/ typedef ap_int aluop_imm_T; -/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/ -typedef ap_int aluop_sh_imm_T; +/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/ +typedef ap_int aluop_shr_arg_T; -/*! -* Define HLS stream depth -*/ -#define PRAGMA_SUB(x) _Pragma (#x) -#define PRAGMA_HLS(x) PRAGMA_SUB(x) -#define STREAM_IN_DEPTH 8 +/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/ +typedef ap_int aluop_mul_arg_T; /*! * \brief Fetch module. @@ -153,13 +146,13 @@ void fetch( * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM. */ void load( - volatile inp_vec_T *inputs, - volatile wgt_vec_T *weights, + volatile bus_T *inputs, + volatile bus_T *weights, hls::stream &load_queue, hls::stream &g2l_dep_queue, hls::stream &l2g_dep_queue, - inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], - wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]); + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]); /*! * \brief Compute module. @@ -187,15 +180,15 @@ void load( void compute( volatile uint32_t &done, volatile uop_T *uops, - volatile acc_vec_T *biases, + volatile bus_T *biases, hls::stream &gemm_queue, hls::stream &l2g_dep_queue, hls::stream &s2g_dep_queue, hls::stream &g2l_dep_queue, hls::stream &g2s_dep_queue, - out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], - wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT], - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]); + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]); /*! * \brief Store module. @@ -211,11 +204,11 @@ void compute( * \param out_mem Local output SRAM buffer. Read only single port BRAM. */ void store( - volatile out_vec_T *outputs, + volatile bus_T *outputs, hls::stream &store_queue, hls::stream &g2s_dep_queue, hls::stream &s2g_dep_queue, - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]); + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]); /*! * \brief VTA wrapper for simulation purpose only. 
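Note on the bus_T ports appearing throughout these hunks: every DRAM-facing pointer now shares one AXI-bus-wide type, so a tensor moves as VTA_*_MATRIX_WIDTH / VTA_BUS_WIDTH bus beats (the *_MAT_AXI_RATIO constants that size the SRAM arrays). A minimal sketch of that arithmetic, assuming the default pynq configuration (64-bit AXI bus, 1x16x16 GEMM core, 8-bit input/weight/output types, 32-bit accumulator); the helper mat_axi_ratio is illustrative only, not part of the patch:

    # Sketch only: mirrors the *_MAT_AXI_RATIO derivation under assumed defaults.
    LOG_BUS_WIDTH = 6                                  # 64-bit AXI data bus
    LOG_BATCH, LOG_BLOCK_IN, LOG_BLOCK_OUT = 0, 4, 4   # 1x16x16 GEMM core

    def mat_axi_ratio(log_dtype_width, *log_dims):
        """Bus beats needed to move one on-chip tensor through a bus_T port."""
        matrix_bits = 1 << (log_dtype_width + sum(log_dims))
        return matrix_bits // (1 << LOG_BUS_WIDTH)

    assert mat_axi_ratio(3, LOG_BATCH, LOG_BLOCK_IN) == 2        # INP_MAT_AXI_RATIO
    assert mat_axi_ratio(3, LOG_BLOCK_IN, LOG_BLOCK_OUT) == 32   # WGT_MAT_AXI_RATIO
    assert mat_axi_ratio(5, LOG_BATCH, LOG_BLOCK_OUT) == 8       # ACC_MAT_AXI_RATIO
    assert mat_axi_ratio(3, LOG_BATCH, LOG_BLOCK_OUT) == 2       # OUT_MAT_AXI_RATIO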
@@ -232,9 +225,9 @@ void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, - volatile inp_vec_T *inputs, - volatile wgt_vec_T *weights, - volatile acc_vec_T *biases, - volatile out_vec_T *outputs); + volatile bus_T *inputs, + volatile bus_T *weights, + volatile bus_T *biases, + volatile bus_T *outputs); #endif // VTA_VTA_H_ diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h index a6f5fd27f528..410a2b24a090 100644 --- a/vta/include/vta/driver.h +++ b/vta/include/vta/driver.h @@ -136,19 +136,23 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size); /*! * \brief Flushes the region of memory out of the CPU cache to DRAM. - * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. - * This need to be the physical address. + * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be flushed. + * This need to be the virtual address. + * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be flushed. + * This need to be the physical address. * \param size Size of the region to flush in Bytes. */ -void VTAFlushCache(vta_phy_addr_t buf, int size); +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size); /*! * \brief Invalidates the region of memory that is cached. - * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated. - * This need to be the physical address. + * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated. + * This need to be the virtual address. + * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated. + * This need to be the physical address. * \param size Size of the region to invalidate in Bytes. */ -void VTAInvalidateCache(vta_phy_addr_t buf, int size); +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size); #ifdef __cplusplus } diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h index 36b0a5d26b3b..9751b2f137c8 100644 --- a/vta/include/vta/hw_spec.h +++ b/vta/include/vta/hw_spec.h @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file hw_spec.h * \brief Preprocessor definitions for VTA HLS design and runtime. */ @@ -32,6 +31,9 @@ extern "C" { #include +/*! Memory bus width */ +#define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH) + /*! log2 of instruction data type width */ #define VTA_LOG_INS_WIDTH 7 /*! Instruction data type width */ @@ -48,10 +50,6 @@ extern "C" { #define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH) /*! Accumulator data type width */ #define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH) -/*! log2 of ALU data type width */ -#define VTA_LOG_ALU_WIDTH (VTA_LOG_ACC_WIDTH - 1) -/*! ALU data type width */ -#define VTA_ALU_WIDTH (1 << VTA_LOG_ALU_WIDTH) /*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/ #define VTA_BATCH (1 << VTA_LOG_BATCH) @@ -60,15 +58,6 @@ extern "C" { /*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */ #define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT) -/*! Weight vector width */ -#define VTA_WGT_VECTOR_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_IN) -/*! Input vector width */ -#define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN) -/*! Accumulator vector width */ -#define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT) -/*! Output vector width */ -#define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT) - /*! On-chip micro-op buffer size in B */ #define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE) /*! 
On-chip weight buffer size in B */ @@ -78,16 +67,36 @@ extern "C" { /*! On-chip accumulator buffer size in B */ #define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE) +/*! Input vector size in bits */ +#define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN) +/*! Weight vector size in bits */ +#define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN) +/*! Accumulator vector size in bits */ +#define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT) +/*! Output vector size in bits */ +#define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT) + +/*! Ratio between input matrix size and axi width */ +#define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between weight matrix size and axi width */ +#define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between accumulator matrix size and axi width */ +#define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between output matrix size and axi width */ +#define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH) + /*! Size of instruction buffer element in B */ #define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8) /*! Size of uop buffer element in B*/ #define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8) /*! Size of activation buffer element in B*/ -#define VTA_INP_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_IN * VTA_INP_WIDTH / 8) +#define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8) /*! Size of weight buffer element in B*/ -#define VTA_WGT_ELEM_BYTES (VTA_BLOCK_OUT * VTA_BLOCK_IN * VTA_WGT_WIDTH / 8) +#define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8) /*! Size of accumulator buffer element in B*/ -#define VTA_ACC_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_OUT * VTA_ACC_WIDTH / 8) +#define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8) +/*! Size of output buffer element in B*/ +#define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8) /*! On-chip micro-op buffer depth */ #define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES) @@ -148,10 +157,14 @@ extern "C" { #define VTA_MEMOP_PAD_BIT_WIDTH 4 /*! Load/Store Instruction: padding value encoding width*/ #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2 -/*! ALU Instruction: immediate bitwidth*/ -#define VTA_ALUOP_IMM_BIT_WIDTH 16 /*! GEMM/ALU Instruction: loop max iter bits */ #define VTA_LOOP_ITER_WIDTH 14 +/*! ALU Instruction: immediate bitwidth*/ +#define VTA_ALUOP_IMM_BIT_WIDTH 16 +/*! ALU Instruction: shift arg bitwidth*/ +#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH) +/*! ALU Instruction: multiply arg bitwidth*/ +#define VTA_MUL_ARG_BIT_WIDTH 8 /*! Mem ID constant: uop memory */ #define VTA_MEM_ID_UOP 0 @@ -164,186 +177,6 @@ extern "C" { /*! 
Mem ID constant: output store buffer */ #define VTA_MEM_ID_OUT 4 -// Instruction organization layout: -// -// LOAD/STORE -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: memory_type | memop_id_T | -// arg 6: pad_value | memop_pad_val_T | -// arg 7: sram_base | memop_sram_T | -// arg 8: dram_base | memop_dram_T | -// arg 9: y_size | memop_size_T | -// arg a: x_size | memop_size_T | -// arg b: x_stride | memop_stride_T | -// arg c: y_pad_0 | memop_pad_T | -// arg d: y_pad_1 | memop_pad_T | -// arg e: x_pad_0 | memop_pad_T | -// arg f: x_pad_1 | memop_pad_T | -// -// GEMM -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: reset_reg | bool | -// arg 6: uop_bgn | uop_idx_T | -// arg 7: uop_end | uop_idx_T | -// arg 8: iteration count ax0 | loop_T | -// arg 9: iteration count ax1 | loop_T | -// arg a: accum idx factor ax0 | acc_idx_T | -// arg b: accum idx factor ax1 | acc_idx_T | -// arg c: input idx factor ax0 | inp_idx_T | -// arg d: input idx factor ax1 | inp_idx_T | -// arg e: weight idx factor ax0 | wgt_idx_T | -// arg f: weight idx factor ax1 | wgt_idx_T | -// -// ALU -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: reset_reg | bool | -// arg 6: uop_bgn | uop_idx_T | -// arg 7: uop_end | uop_idx_T | -// arg 8: iteration count ax0 | loop_T | -// arg 9: iteration count ax1 | loop_T | -// arg a: dst idx factor ax0 | acc_idx_T | -// arg b: dst idx factor ax1 | acc_idx_T | -// arg c: src idx factor ax0 | inp_idx_T | -// arg d: src idx factor ax1 | inp_idx_T | -// arg e: alu_opcode | aluop_opcode_T | -// arg f: use_imm | bool | -// arg g: imm | alu_imm_T | - -/*! Load/Store instruction start position of the opcode field */ -#define VTA_INSN_MEM_0_0 0 -/*! Load/Store instruction end position of the opcode field */ -#define VTA_INSN_MEM_0_1 (VTA_INSN_MEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1) -/*! Load/Store instruction position of the pop_prev_dep field */ -#define VTA_INSN_MEM_1 (VTA_INSN_MEM_0_1 + 1) -/*! Load/Store instruction position of the pop_next_dep field */ -#define VTA_INSN_MEM_2 (VTA_INSN_MEM_1 + 1) -/*! Load/Store instruction position of the push_prev_dependence field */ -#define VTA_INSN_MEM_3 (VTA_INSN_MEM_2 + 1) -/*! Load/Store instruction position of the push_next_dependence field */ -#define VTA_INSN_MEM_4 (VTA_INSN_MEM_3 + 1) -/*! Load/Store instruction start position of the memory_type field */ -#define VTA_INSN_MEM_5_0 (VTA_INSN_MEM_4 + 1) -/*! Load/Store instruction end position of the memory_type field */ -#define VTA_INSN_MEM_5_1 (VTA_INSN_MEM_5_0 + VTA_MEMOP_ID_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the sram_base field */ -#define VTA_INSN_MEM_6_0 (VTA_INSN_MEM_5_1 + 1) -/*! Load/Store instruction end position of the sram_base field */ -#define VTA_INSN_MEM_6_1 (VTA_INSN_MEM_6_0 + VTA_MEMOP_SRAM_ADDR_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the dram_base field */ -#define VTA_INSN_MEM_7_0 (VTA_INSN_MEM_6_1 + 1) -/*! 
Load/Store instruction end position of the dram_base field */ -#define VTA_INSN_MEM_7_1 (VTA_INSN_MEM_7_0 + VTA_MEMOP_DRAM_ADDR_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_size field */ -#define VTA_INSN_MEM_8_0 64 -/*! Load/Store instruction end position of the y_size field */ -#define VTA_INSN_MEM_8_1 (VTA_INSN_MEM_8_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_size field */ -#define VTA_INSN_MEM_9_0 (VTA_INSN_MEM_8_1 + 1) -/*! Load/Store instruction start position of the x_size field */ -#define VTA_INSN_MEM_9_1 (VTA_INSN_MEM_9_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_stride field */ -#define VTA_INSN_MEM_A_0 (VTA_INSN_MEM_9_1 + 1) -/*! Load/Store instruction end position of the x_stride field */ -#define VTA_INSN_MEM_A_1 (VTA_INSN_MEM_A_0 + VTA_MEMOP_STRIDE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_pad_0 field */ -#define VTA_INSN_MEM_B_0 (VTA_INSN_MEM_A_1 + 1) -/*! Load/Store instruction start position of the y_pad_0 field */ -#define VTA_INSN_MEM_B_1 (VTA_INSN_MEM_B_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_pad_1 field */ -#define VTA_INSN_MEM_C_0 (VTA_INSN_MEM_B_1 + 1) -/*! Load/Store instruction start position of the y_pad_1 field */ -#define VTA_INSN_MEM_C_1 (VTA_INSN_MEM_C_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_pad_0 field */ -#define VTA_INSN_MEM_D_0 (VTA_INSN_MEM_C_1 + 1) -/*! Load/Store instruction start position of the x_pad_0 field */ -#define VTA_INSN_MEM_D_1 (VTA_INSN_MEM_D_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_pad_1 field */ -#define VTA_INSN_MEM_E_0 (VTA_INSN_MEM_D_1 + 1) -/*! Load/Store instruction start position of the x_pad_1 field */ -#define VTA_INSN_MEM_E_1 (VTA_INSN_MEM_E_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) - -/*! GEMM instruction start position of the opcode field */ -#define VTA_INSN_GEM_0_0 0 -/*! GEMM instruction end position of the opcode field */ -#define VTA_INSN_GEM_0_1 (VTA_INSN_GEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1) -/*! GEMM instruction position of the pop_prev_dep field */ -#define VTA_INSN_GEM_1 (VTA_INSN_GEM_0_1 + 1) -/*! GEMM instruction position of the pop_next_dep field */ -#define VTA_INSN_GEM_2 (VTA_INSN_GEM_1 + 1) -/*! GEMM instruction position of the push_prev_dependence field */ -#define VTA_INSN_GEM_3 (VTA_INSN_GEM_2 + 1) -/*! GEMM instruction position of the push_next_dependence field */ -#define VTA_INSN_GEM_4 (VTA_INSN_GEM_3 + 1) -/*! GEMM instruction position of the reset register bit */ -#define VTA_INSN_GEM_5 (VTA_INSN_GEM_4 + 1) -/*! GEMM instruction start position of the uop_bgn field */ -#define VTA_INSN_GEM_6_0 (VTA_INSN_GEM_5 + 1) -/*! GEMM instruction end position of the uop_bgn field */ -#define VTA_INSN_GEM_6_1 (VTA_INSN_GEM_6_0 + VTA_LOG_UOP_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the uop_end field */ -#define VTA_INSN_GEM_7_0 (VTA_INSN_GEM_6_1 + 1) -/*! GEMM instruction end position of the uop_end field */ -#define VTA_INSN_GEM_7_1 (VTA_INSN_GEM_7_0 + VTA_LOG_UOP_BUFF_DEPTH + 1 - 1) -/*! GEMM instruction start position of the iter_out field */ -#define VTA_INSN_GEM_8_0 (VTA_INSN_GEM_7_1 + 1) -/*! GEMM instruction end position of the iter_out field */ -#define VTA_INSN_GEM_8_1 (VTA_INSN_GEM_8_0 + VTA_LOOP_ITER_WIDTH - 1) -/*! GEMM instruction start position of the iter_in field */ -#define VTA_INSN_GEM_9_0 (VTA_INSN_GEM_8_1 + 1) -/*! 
GEMM instruction end position of the iter_in field */ -#define VTA_INSN_GEM_9_1 (VTA_INSN_GEM_9_0 + VTA_LOOP_ITER_WIDTH - 1) -/*! GEMM instruction start position of the dst_factor_out field */ -#define VTA_INSN_GEM_A_0 64 -/*! GEMM instruction end position of the dst_factor_out field */ -#define VTA_INSN_GEM_A_1 (VTA_INSN_GEM_A_0 + VTA_LOG_ACC_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the dst_factor_in field */ -#define VTA_INSN_GEM_B_0 (VTA_INSN_GEM_A_1 + 1) -/*! GEMM instruction end position of the dst_factor_in field */ -#define VTA_INSN_GEM_B_1 (VTA_INSN_GEM_B_0 + VTA_LOG_ACC_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the src_factor_out field */ -#define VTA_INSN_GEM_C_0 (VTA_INSN_GEM_B_1 + 1) -/*! GEMM instruction end position of the src_factor_out field */ -#define VTA_INSN_GEM_C_1 (VTA_INSN_GEM_C_0 + VTA_LOG_INP_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the src_factor_in field */ -#define VTA_INSN_GEM_D_0 (VTA_INSN_GEM_C_1 + 1) -/*! GEMM instruction end position of the src_factor_in field */ -#define VTA_INSN_GEM_D_1 (VTA_INSN_GEM_D_0 + VTA_LOG_INP_BUFF_DEPTH - 1) - -/*! GEMM instruction start position of the wgt_factor_out field */ -#define VTA_INSN_GEM_E_0 (VTA_INSN_GEM_D_1 + 1) -/*! GEMM instruction end position of the wgt_factor_out field */ -#define VTA_INSN_GEM_E_1 (VTA_INSN_GEM_E_0 + VTA_LOG_WGT_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the wgt_factor_in field */ -#define VTA_INSN_GEM_F_0 (VTA_INSN_GEM_E_1 + 1) -/*! GEMM instruction end position of the wgt_factor_in field */ -#define VTA_INSN_GEM_F_1 (VTA_INSN_GEM_F_0 + VTA_LOG_WGT_BUFF_DEPTH - 1) - -/*! ALU instruction start position of the alu_opcode field */ -#define VTA_INSN_ALU_E_0 (VTA_INSN_GEM_D_1 + 1) -/*! ALU instruction end position of the alu_opcode field */ -#define VTA_INSN_ALU_E_1 (VTA_INSN_ALU_E_0 + VTA_ALU_OPCODE_BIT_WIDTH - 1) -/*! ALU instruction position of the use_imm field */ -#define VTA_INSN_ALU_F (VTA_INSN_ALU_E_1 + 1) -/*! ALU instruction start position of the immediate field */ -#define VTA_INSN_ALU_G_0 (VTA_INSN_ALU_F + 1) -/*! ALU instruction end position of the immediate field */ -#define VTA_INSN_ALU_G_1 (VTA_INSN_ALU_G_0 + VTA_ALUOP_IMM_BIT_WIDTH - 1) - /*! GEMM Micro-op start position of the acc_idx field */ #define VTA_UOP_GEM_0_0 0 /*! GEMM Micro-op end position of the acc_idx field */ @@ -368,8 +201,20 @@ extern "C" { /*! \brief VTA generic instruction */ typedef struct { - uint64_t word_0 : 64; - uint64_t word_1 : 64; + /*! \brief The instruction opcode */ + uint64_t opcode : VTA_OPCODE_BIT_WIDTH; + /*! \brief Unused in this instruction */ + uint64_t pop_prev_dep : 1; + /*! \brief Pop dependence token from GEMM stage */ + uint64_t pop_next_dep : 1; + /*! \brief Unused in this instruction */ + uint64_t push_prev_dep : 1; + /*! \brief Push dependence token to GEMM stage */ + uint64_t push_next_dep : 1; + /*! \brief Padding */ + uint64_t pad_0 : 64 - VTA_OPCODE_BIT_WIDTH - 4; + /*! \brief Padding */ + uint64_t pad_1 : 64; } VTAGenericInsn; /*! 
\brief VTA load/store instruction diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py index 7c5ee5523e38..b3d7df49328e 100644 --- a/vta/python/vta/bitstream.py +++ b/vta/python/vta/bitstream.py @@ -45,10 +45,11 @@ def get_bitstream_path(): # Derive destination path cache_dir = os.getenv("VTA_CACHE_PATH", os.path.join(os.getenv("HOME"), ".vta_cache/")) cache_dir = os.path.join(cache_dir, env.TARGET) + cache_dir = os.path.join(cache_dir, env.HW_VER.replace('.', '_')) # Create the directory if it didn't exist if not os.path.exists(cache_dir): os.makedirs(cache_dir) - bit_path = os.path.join(cache_dir, env.BITSTREAM) + bit_path = os.path.join(cache_dir, env.BITSTREAM) + ".bit" return bit_path @@ -63,7 +64,7 @@ def download_bitstream(): bit = get_bitstream_path() url = os.path.join(BITSTREAM_URL, env.TARGET) url = os.path.join(url, env.HW_VER) - url = os.path.join(url, env.BITSTREAM) + url = os.path.join(url, env.BITSTREAM + ".bit") try: download(url, bit) diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 093b0ec5c386..ee2428be828b 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -113,15 +113,9 @@ class Environment(object): # initialization function def __init__(self, cfg): - self.__dict__.update(cfg) - for key in PkgConfig.cfg_keys: - if key not in cfg: - raise ValueError("Expect key %s in cfg" % key) - # derive output buffer size - self.LOG_OUT_BUFF_SIZE = ( - self.LOG_ACC_BUFF_SIZE + - self.LOG_OUT_WIDTH - - self.LOG_ACC_WIDTH) + # Produce the derived parameters and update dict + self.pkg = self.pkg_config(cfg) + self.__dict__.update(self.pkg.cfg_dict) # data type width self.INP_WIDTH = 1 << self.LOG_INP_WIDTH self.WGT_WIDTH = 1 << self.LOG_WGT_WIDTH @@ -154,25 +148,15 @@ def __init__(self, cfg): self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8 self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8 self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8 - # Configuration bitstream name - self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format( - (1 << cfg["LOG_BATCH"]), - (1 << cfg["LOG_BLOCK_IN"]), - (1 << cfg["LOG_BLOCK_OUT"]), - (1 << cfg["LOG_INP_WIDTH"]), - (1 << cfg["LOG_WGT_WIDTH"]), - cfg["LOG_UOP_BUFF_SIZE"], - cfg["LOG_INP_BUFF_SIZE"], - cfg["LOG_WGT_BUFF_SIZE"], - cfg["LOG_ACC_BUFF_SIZE"], - cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) # dtypes self.acc_dtype = "int%d" % self.ACC_WIDTH self.inp_dtype = "int%d" % self.INP_WIDTH self.wgt_dtype = "int%d" % self.WGT_WIDTH self.out_dtype = "int%d" % self.OUT_WIDTH + # bistream name + self.BITSTREAM = self.pkg.bitstream + # model string + self.MODEL = self.TARGET + "_" + self.BITSTREAM # lazy cached members self.mock_mode = False self._mock_env = None @@ -187,11 +171,15 @@ def __enter__(self): def __exit__(self, ptype, value, trace): Environment.current = self._last_env - def pkg_config(self): + def pkg_config(self, cfg): """PkgConfig instance""" curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) proj_root = os.path.abspath(os.path.join(curr_path, "../../")) - return PkgConfig(self.__dict__, proj_root) + return PkgConfig(cfg, proj_root) + + @property + def cfg_dict(self): + return self.pkg.cfg_dict @property def dev(self): @@ -236,13 +224,15 @@ def gemm(self): @property def target(self): - return tvm.target.vta(model=self.TARGET) + return tvm.target.vta(model=self.MODEL) @property def target_host(self): """The target host""" if self.TARGET == "pynq": return "llvm -target=armv7-none-linux-gnueabihf" + if 
self.TARGET == "ultra96": + return "llvm -target=aarch64-linux-gnu" if self.TARGET == "sim" or self.TARGET == "tsim": return "llvm" raise ValueError("Unknown target %s" % self.TARGET) @@ -316,21 +306,18 @@ def coproc_dep_pop(op): def _init_env(): - """Iniitalize the default global env""" + """Initialize the default global env""" curr_path = os.path.dirname( os.path.abspath(os.path.expanduser(__file__))) proj_root = os.path.abspath(os.path.join(curr_path, "../../../")) path_list = [ - os.path.join(curr_path, "vta_config.json"), - os.path.join(proj_root, "build", "vta_config.json"), - os.path.join(proj_root, "vta_config.json"), os.path.join(proj_root, "vta/config/vta_config.json") ] path_list = [p for p in path_list if os.path.exists(p)] if not path_list: raise RuntimeError( - "Error: {} not found.make sure you have config.json in your vta root" - .format(filename)) - return Environment(json.load(open(path_list[0]))) + "Error: vta_config.json not found.") + cfg = json.load(open(path_list[0])) + return Environment(cfg) Environment.current = _init_env() diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 3977d5aa2e43..5390ee0de9a8 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -38,49 +38,209 @@ class PkgConfig(object): """ cfg_keys = [ "TARGET", - "HW_FREQ", - "HW_CLK_TARGET", - "HW_VER", "LOG_INP_WIDTH", "LOG_WGT_WIDTH", "LOG_ACC_WIDTH", - "LOG_OUT_WIDTH", "LOG_BATCH", - "LOG_BLOCK_IN", - "LOG_BLOCK_OUT", + "LOG_BLOCK", "LOG_UOP_BUFF_SIZE", "LOG_INP_BUFF_SIZE", "LOG_WGT_BUFF_SIZE", "LOG_ACC_BUFF_SIZE", ] + def __init__(self, cfg, proj_root): - # include path + + # Derived parameters + cfg["LOG_BLOCK_IN"] = cfg["LOG_BLOCK"] + cfg["LOG_BLOCK_OUT"] = cfg["LOG_BLOCK"] + cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"] + cfg["LOG_OUT_BUFF_SIZE"] = ( + cfg["LOG_ACC_BUFF_SIZE"] + + cfg["LOG_OUT_WIDTH"] - + cfg["LOG_ACC_WIDTH"]) + + # Update cfg now that we've extended it + self.__dict__.update(cfg) + + # Include path self.include_path = [ "-I%s/include" % proj_root, "-I%s/vta/include" % proj_root, "-I%s/3rdparty/dlpack/include" % proj_root, "-I%s/3rdparty/dmlc-core/include" % proj_root ] + # List of source files that can be used to build standalone library. self.lib_source = [] self.lib_source += glob.glob("%s/vta/src/*.cc" % proj_root) - self.lib_source += glob.glob("%s/vta/src/%s/*.cc" % (proj_root, cfg["TARGET"])) - # macro keys - self.macro_defs = [] - self.cfg_dict = {} - for key in self.cfg_keys: - self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key]))) - self.cfg_dict[key] = cfg[key] + if self.TARGET in ["pynq", "ultra96"]: + # add pynq drivers for any board that uses pynq driver stack (see pynq.io) + self.lib_source += glob.glob("%s/vta/src/pynq/*.cc" % (proj_root)) - self.target = cfg["TARGET"] - - if self.target == "pynq": + # Linker flags + if self.TARGET in ["pynq", "ultra96"]: self.ldflags = [ "-L/usr/lib", "-l:libcma.so"] else: self.ldflags = [] + # Derive bitstream config string. 
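+        # The name encodes GEMM geometry, data widths, and log2 buffer sizes so a
+        # per-target cache directory can hold several variants side by side; e.g. a
+        # 1x16 core with 8-bit inputs/weights, a 32-bit accumulator, and buffer size
+        # exponents 15/15/18/17 yields "1x16_i8w8a32_15_15_18_17".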
+ self.bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format( + (1 << cfg["LOG_BATCH"]), + (1 << cfg["LOG_BLOCK"]), + (1 << cfg["LOG_INP_WIDTH"]), + (1 << cfg["LOG_WGT_WIDTH"]), + (1 << cfg["LOG_ACC_WIDTH"]), + cfg["LOG_UOP_BUFF_SIZE"], + cfg["LOG_INP_BUFF_SIZE"], + cfg["LOG_WGT_BUFF_SIZE"], + cfg["LOG_ACC_BUFF_SIZE"]) + + # Derive FPGA parameters from target + # - device: part number + # - family: fpga family + # - freq: PLL frequency + # - per: clock period to achieve in HLS + # (how aggressively design is pipelined) + # - axi_bus_width: axi bus width used for DMA transactions + # (property of FPGA memory interface) + # - axi_cache_bits: ARCACHE/AWCACHE signals for the AXI bus + # (e.g. 1111 is write-back read and write allocate) + # - axi_prot_bits: ARPROT/AWPROT signals for the AXI bus + if self.TARGET == "ultra96": + self.fpga_device = "xczu3eg-sbva484-1-e" + self.fpga_family = "zynq-ultrascale+" + self.fpga_freq = 333 + self.fpga_per = 2 + self.fpga_log_axi_bus_width = 7 + self.axi_prot_bits = '010' + # IP register address map + self.ip_reg_map_range = "0x1000" + self.fetch_base_addr = "0xA0000000" + self.load_base_addr = "0xA0001000" + self.compute_base_addr = "0xA0002000" + self.store_base_addr = "0xA0003000" + else: + # By default, we use the pynq parameters + self.fpga_device = "xc7z020clg484-1" + self.fpga_family = "zynq-7000" + self.fpga_freq = 100 + self.fpga_per = 7 + self.fpga_log_axi_bus_width = 6 + self.axi_prot_bits = '000' + # IP register address map + self.ip_reg_map_range = "0x1000" + self.fetch_base_addr = "0x43C00000" + self.load_base_addr = "0x43C01000" + self.compute_base_addr = "0x43C02000" + self.store_base_addr = "0x43C03000" + # Set coherence settings + coherent = True + if coherent: + self.axi_cache_bits = '1111' + self.coherent = True + + # Define IP memory mapped registers offsets. + # In HLS 0x00-0x0C is reserved for block-level I/O protocol. + # Make sure to leave 8B between register offsets to maintain + # compatibility with 64bit systems. + self.fetch_insn_count_offset = 0x10 + self.fetch_insn_addr_offset = self.fetch_insn_count_offset + 0x08 + self.load_inp_addr_offset = 0x10 + self.load_wgt_addr_offset = self.load_inp_addr_offset + 0x08 + self.compute_done_wr_offet = 0x10 + self.compute_done_rd_offet = self.compute_done_wr_offet + 0x08 + self.compute_uop_addr_offset = self.compute_done_rd_offet + 0x08 + self.compute_bias_addr_offset = self.compute_uop_addr_offset + 0x08 + self.store_out_addr_offset = 0x10 + + # Derive SRAM parameters + # The goal here is to determine how many memory banks are needed, + # how deep and wide each bank needs to be. This is derived from + # the size of each memory element (result of data width, and tensor shape), + # and also how wide a memory can be as permitted by the FPGA tools. + # + # The mem axi ratio is a parameter used by HLS to resize memories + # so memory read/write ports are the same size as the design axi bus width. 
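+        # Worked example (values assume the stock pynq config): one input tensor is
+        # 1 x 16 x 8b = 128b and the input buffer is 1 << 15 B, so a single
+        # 128b-wide bank of depth (1 << 15) * 8 / 128 = 2048 suffices, and its
+        # axi ratio against the 64b bus is 128 / 64 = 2.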
+ # + # Max bus width allowed (property of FPGA vendor toolchain) + max_bus_width = 1024 + # Bus width of a memory interface + mem_bus_width = 1 << self.fpga_log_axi_bus_width + # Input memory + inp_mem_bus_width = 1 << (cfg["LOG_INP_WIDTH"] + \ + cfg["LOG_BATCH"] + \ + cfg["LOG_BLOCK_IN"]) + self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"] # bytes + self.inp_mem_banks = (inp_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.inp_mem_width = min(inp_mem_bus_width, max_bus_width) + self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width + self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width + # Weight memory + wgt_mem_bus_width = 1 << (cfg["LOG_WGT_WIDTH"] + \ + cfg["LOG_BLOCK_IN"] + \ + cfg["LOG_BLOCK_OUT"]) + self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"] # bytes + self.wgt_mem_banks = (wgt_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width) + self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width + self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width + # Output memory + out_mem_bus_width = 1 << (cfg["LOG_OUT_WIDTH"] + \ + cfg["LOG_BATCH"] + \ + cfg["LOG_BLOCK_OUT"]) + self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"] # bytes + self.out_mem_banks = (out_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.out_mem_width = min(out_mem_bus_width, max_bus_width) + self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width + self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width + + # Macro defs + self.macro_defs = [] + self.cfg_dict = {} + for key in cfg: + self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key]))) + self.cfg_dict[key] = cfg[key] + self.macro_defs.append("-DVTA_LOG_BUS_WIDTH=%s" % (self.fpga_log_axi_bus_width)) + # Macros used by the VTA driver + self.macro_defs.append("-DVTA_IP_REG_MAP_RANGE=%s" % (self.ip_reg_map_range)) + self.macro_defs.append("-DVTA_FETCH_ADDR=%s" % (self.fetch_base_addr)) + self.macro_defs.append("-DVTA_LOAD_ADDR=%s" % (self.load_base_addr)) + self.macro_defs.append("-DVTA_COMPUTE_ADDR=%s" % (self.compute_base_addr)) + self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr)) + # IP register offsets + self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \ + (self.fetch_insn_count_offset)) + self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \ + (self.fetch_insn_addr_offset)) + self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \ + (self.load_inp_addr_offset)) + self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \ + (self.load_wgt_addr_offset)) + self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \ + (self.compute_done_wr_offet)) + self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \ + (self.compute_done_rd_offet)) + self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \ + (self.compute_uop_addr_offset)) + self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \ + (self.compute_bias_addr_offset)) + self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \ + (self.store_out_addr_offset)) + # Coherency + if coherent: + self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true") + else: + self.macro_defs.append("-DVTA_COHERENT_ACCESSES=false") + @property def cflags(self): return self.include_path + self.macro_defs diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py index 5c5a86293885..e735a4cf252c 100644 --- a/vta/python/vta/program_bitstream.py +++ b/vta/python/vta/program_bitstream.py @@ -48,9 +48,12 @@ def 
pynq_bitstream_program(bitstream_path): bitstream.download() def bitstream_program(target, bitstream): - if target == 'pynq': + if target in ['pynq', 'ultra96']: pynq_bitstream_program(bitstream) - elif target != 'sim': + elif target in ['sim', 'tsim']: + # In simulation, bit stream programming is a no-op + return + else: raise RuntimeError("Unknown target {}".format(target)) if __name__ == "__main__": diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py index a5bafab498a5..f689ef46ba1c 100644 --- a/vta/python/vta/rpc_client.py +++ b/vta/python/vta/rpc_client.py @@ -30,7 +30,7 @@ def reconfig_runtime(remote): """ env = get_env() freconfig = remote.get_function("tvm.contrib.vta.reconfig_runtime") - freconfig(env.pkg_config().cfg_json) + freconfig(env.pkg.cfg_json) def program_fpga(remote, bitstream=None): diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index b748cdf23358..67fc6b275b79 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -33,7 +33,6 @@ def run(run_func): env = get_env() if env.TARGET in ["sim", "tsim"]: - # Talk to local RPC if necessary to debug RPC server. # Compile vta on your host with make at the root. # Make sure TARGET is set to "sim" in the config.json file. @@ -53,21 +52,20 @@ def run(run_func): assert simulator.enabled() run_func(env, rpc.LocalSession()) - elif env.TARGET == "pynq": - + elif env.TARGET in ["pynq", "ultra96"]: # The environment variables below should be set if we are using # a tracker to obtain a remote for a test device - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = os.environ.get("TVM_TRACKER_PORT", None) + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) # Otherwise, we can set the variables below to directly # obtain a remote from a test device pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None) pynq_port = os.environ.get("VTA_PYNQ_RPC_PORT", None) # Run device from fleet node if env variables are defined - if tracket_host and tracket_port: + if tracker_host and tracker_port: remote = autotvm.measure.request_remote(env.TARGET, - tracket_host, - int(tracket_port), + tracker_host, + int(tracker_port), timeout=10000) run_func(env, remote) else: @@ -78,3 +76,6 @@ def run(run_func): else: raise RuntimeError( "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") + + else: + raise RuntimeError("Unknown target %s" % env.TARGET) diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc index 47ca604d9d39..a37bb4e466af 100644 --- a/vta/src/pynq/pynq_driver.cc +++ b/vta/src/pynq/pynq_driver.cc @@ -15,12 +15,9 @@ * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors + * * \file pynq_driver.c - * \brief VTA driver for Pynq board. + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). */ #include @@ -53,19 +50,19 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { - // Call the xlnkFlushCache on the CMA buffer +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_flush_cache on the CMA buffer // so that the FPGA can read the buffer data. 
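+  // Both addresses are needed now: the pynq cma_flush_cache call takes the
+  // virtual pointer together with the physical CMA address (see the extern
+  // declaration added in pynq_driver.h), hence the widened signature here.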
- xlnkFlushCache(reinterpret_cast(buf), size); + cma_flush_cache(vir_addr, phy_addr, size); } -void VTAInvalidateCache(vta_phy_addr_t buf, int size) { - // Call the xlnkInvalidateCache on the CMA buffer +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_invalidate_cache on the CMA buffer // so that the host needs to read the buffer data. - xlnkInvalidateCache(reinterpret_cast(buf), size); + cma_invalidate_cache(vir_addr, phy_addr, size); } -void *VTAMapRegister(uint32_t addr, size_t length) { +void *VTAMapRegister(uint32_t addr) { // Align the base address with the pages uint32_t virt_base = addr & ~(getpagesize() - 1); // Calculate base address offset w.r.t the base address @@ -73,16 +70,16 @@ void *VTAMapRegister(uint32_t addr, size_t length) { // Open file and mmap uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); return mmap(NULL, - (length+virt_offset), + (VTA_IP_REG_MAP_RANGE + virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base); } -void VTAUnmapRegister(void *vta, size_t length) { +void VTAUnmapRegister(void *vta) { // Unmap memory - int status = munmap(vta, length); + int status = munmap(vta, VTA_IP_REG_MAP_RANGE); assert(status == 0); } @@ -98,39 +95,30 @@ class VTADevice { public: VTADevice() { // VTA stage handles - vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR); + vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR); + vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR); + vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR); } ~VTADevice() { // Close VTA stage handle - VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE); - VTAUnmapRegister(vta_load_handle_, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle_, VTA_RANGE); - VTAUnmapRegister(vta_store_handle_, VTA_RANGE); + VTAUnmapRegister(vta_fetch_handle_); + VTAUnmapRegister(vta_load_handle_); + VTAUnmapRegister(vta_compute_handle_); + VTAUnmapRegister(vta_store_handle_); } int Run(vta_phy_addr_t insn_phy_addr, uint32_t insn_count, uint32_t wait_cycles) { - // NOTE: Register address map is derived from the auto-generated - // driver files available under hardware/build/vivado//export/driver - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count); - // FETCH @ 0x18 : Data signal of insns_V - VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr); - // LOAD @ 0x10 : Data signal of inputs_V - VTAWriteMappedReg(vta_load_handle_, 0x10, 0); - // LOAD @ 0x18 : Data signal of weight_V - VTAWriteMappedReg(vta_load_handle_, 0x18, 0); - // COMPUTE @ 0x20 : Data signal of uops_V - VTAWriteMappedReg(vta_compute_handle_, 0x20, 0); - // COMPUTE @ 0x28 : Data signal of biases_V - VTAWriteMappedReg(vta_compute_handle_, 0x28, 0); - // STORE @ 0x10 : Data signal of outputs_V - VTAWriteMappedReg(vta_store_handle_, 0x10, 0); + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0); + 
VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); // VTA start VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); @@ -141,7 +129,7 @@ class VTADevice { // Loop until the VTA is done unsigned t, flag = 0; for (t = 0; t < wait_cycles; ++t) { - flag = VTAReadMappedReg(vta_compute_handle_, 0x18); + flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); if (flag == VTA_DONE) break; std::this_thread::yield(); } diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h index 7cfee4cf0958..bb6ca3db2b93 100644 --- a/vta/src/pynq/pynq_driver.h +++ b/vta/src/pynq/pynq_driver.h @@ -6,21 +6,18 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file vta_pynq_driver.h - * \brief VTA driver for Pynq board. + * + * \file pynq_driver.h + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). */ #ifndef VTA_PYNQ_PYNQ_DRIVER_H_ @@ -41,23 +38,21 @@ extern "C" { #include #include -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) #include #else void* cma_alloc(size_t size, int cached); void cma_free(void* buf); uint32_t cma_get_phy_addr(void* buf); +void cma_flush_cache(void* buf, unsigned int phys_addr, int size); +void cma_invalidate_cache(void* buf, unsigned int phys_addr, int size); #endif -void xlnkFlushCache(void* buf, int size); -void xlnkInvalidateCache(void* buf, int size); -void *VTAMapRegister(uint32_t addr, size_t length); -void VTAUnmapRegister(void *vta, size_t length); +void *VTAMapRegister(uint32_t addr); +void VTAUnmapRegister(void *vta); void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val); uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); -/*! \brief VTA configuration register address range */ -#define VTA_RANGE 0x100 /*! \brief VTA configuration register start value */ #define VTA_START 0x1 /*! \brief VTA configuration register auto-restart value */ @@ -65,27 +60,6 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); /*! \brief VTA configuration register done value */ #define VTA_DONE 0x1 -/*! \brief VTA fetch stage configuration register address -* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_FETCH_ADDR 0x43C00000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_COMPUTE_ADDR 0x43C10000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_LOAD_ADDR 0x43C20000 -/*! 
\brief VTA store stage configuration register address -* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_STORE_ADDR 0x43C30000 - #ifdef __cplusplus } #endif diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index cebfaf7bb68f..4a6552892f4e 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -44,8 +44,10 @@ namespace vta { static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8, "VTA_UOP_WIDTH do not match VTAUop size"); -/*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */ -static const bool kBufferCoherent = true; +/*! \brief Enable coherent access of data buffers between VTA and CPU */ +static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; +/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ +static const bool kAlwaysCache = true; /*! * \brief Data buffer represents data on CMA. @@ -65,8 +67,10 @@ struct DataBuffer { * \param size The size of the data. */ void InvalidateCache(size_t offset, size_t size) { - if (!kBufferCoherent) { - VTAInvalidateCache(phy_addr_ + offset, size); + if (!kBufferCoherent && kAlwaysCache) { + VTAInvalidateCache(reinterpret_cast(data_) + offset, + phy_addr_ + offset, + size); } } /*! @@ -75,8 +79,10 @@ struct DataBuffer { * \param size The size of the data. */ void FlushCache(size_t offset, size_t size) { - if (!kBufferCoherent) { - VTAFlushCache(phy_addr_ + offset, size); + if (!kBufferCoherent && kAlwaysCache) { + VTAFlushCache(reinterpret_cast(data_) + offset, + phy_addr_ + offset, + size); } } /*! @@ -102,7 +108,7 @@ struct DataBuffer { * \param size The size of the buffer. */ static DataBuffer* Alloc(size_t size) { - void* data = VTAMemAlloc(size, 1); + void* data = VTAMemAlloc(size, kAlwaysCache); CHECK(data != nullptr); DataBuffer* buffer = new DataBuffer(); buffer->data_ = data; @@ -469,7 +475,9 @@ class UopQueue : public BaseQueue { // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_phy_, offset); + VTAFlushCache(fpga_buff_, + fpga_buff_phy_, + offset); } } @@ -860,7 +868,9 @@ class InsnQueue : public BaseQueue { // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_phy_, buff_size); + VTAFlushCache(fpga_buff_, + fpga_buff_phy_, + buff_size); } } @@ -1302,9 +1312,9 @@ class CommandQueue { // The kernel we are currently recording UopKernel* record_kernel_{nullptr}; // Micro op queue - UopQueue uop_queue_; + UopQueue uop_queue_; // instruction queue - InsnQueue insn_queue_; + InsnQueue insn_queue_; // Device handle VTADeviceHandle device_{nullptr}; #ifdef USE_TSIM diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index 9d81befdaa6e..ca0fd7ec521a 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -615,10 +615,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } -void VTAInvalidateCache(vta_phy_addr_t buf, int size) { +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } VTADeviceHandle VTADeviceAlloc() { diff --git a/vta/src/tsim/tsim_driver.cc b/vta/src/tsim/tsim_driver.cc index 799ee27e5a9a..a7bcc3c54ca8 100644 --- a/vta/src/tsim/tsim_driver.cc +++ 
b/vta/src/tsim/tsim_driver.cc @@ -228,10 +228,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } -void VTAInvalidateCache(vta_phy_addr_t buf, int size) { +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } VTADeviceHandle VTADeviceAlloc() { diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc index e88cede4d055..7c47c2c3e012 100644 --- a/vta/tests/hardware/common/test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ @@ -32,10 +31,10 @@ uint64_t vta( uint32_t insn_count, VTAGenericInsn *insns, VTAUop *uops, - inp_T *inputs, - wgt_T *weights, - acc_T *biases, - inp_T *outputs) { + uint32_t *inputs, + uint32_t *weights, + uint32_t *biases, + uint32_t *outputs) { // Performance counter variables uint64_t t_fpga; struct timespec start, stop; @@ -53,18 +52,18 @@ uint64_t vta( snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit"); // Get VTA handles - void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR); + void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR); + void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR); + void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR); // Physical address pointers - uint32_t insn_phy = insns ? VTAMemGetPhyAddr(insns) : 0; - uint32_t uop_phy = uops ? VTAMemGetPhyAddr(uops) : 0; - uint32_t input_phy = inputs ? VTAMemGetPhyAddr(inputs) : 0; - uint32_t weight_phy = weights ? VTAMemGetPhyAddr(weights) : 0; - uint32_t bias_phy = biases ? VTAMemGetPhyAddr(biases) : 0; - uint32_t output_phy = outputs ? VTAMemGetPhyAddr(outputs) : 0; + uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; + uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; + uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; + uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; + uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; + uint32_t output_phy = outputs ? 
cma_get_phy_addr(outputs) : 0; #if VTA_DEBUG == 1 printf("INFO - Starting FPGA!\n"); @@ -72,20 +71,13 @@ uint64_t vta( clock_gettime(CLOCK_REALTIME, &start); - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); - // FETCH @ 0x18 : Data signal of insns_V - if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); - // LOAD @ 0x10 : Data signal of inputs_V - if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); - // LOAD @ 0x18 : Data signal of weight_V - if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); - // COMPUTE @ 0x20 : Data signal of uops_V - if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); - // COMPUTE @ 0x28 : Data signal of biases_V - if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); - // STORE @ 0x10 : Data signal of outputs_V - if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); + VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); + if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy); + if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy); + if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy); + if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy); + if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy); + if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy); // VTA start VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); @@ -95,7 +87,7 @@ uint64_t vta( int flag = 0, t = 0; for (t = 0; t < 10000000; ++t) { - flag = VTAReadMappedReg(vta_compute_handle, 0x18); + flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET); if (flag & VTA_DONE) break; } @@ -111,10 +103,10 @@ uint64_t vta( t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); // Unmap VTA register - VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); - VTAUnmapRegister(vta_load_handle, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle, VTA_RANGE); - VTAUnmapRegister(vta_store_handle, VTA_RANGE); + VTAUnmapRegister(vta_fetch_handle); + VTAUnmapRegister(vta_load_handle); + VTAUnmapRegister(vta_compute_handle); + VTAUnmapRegister(vta_store_handle); return t_fpga; } @@ -147,27 +139,30 @@ const char* getOpcodeString(int opcode, bool use_imm) { } else if (opcode == VTA_ALU_OPCODE_SHR) { return "shr"; } + // else if (opcode == VTA_ALU_OPCODE_MUL) { + // return "mul"; + // } return "unknown op"; } -template -void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) { +template +void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) { + assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH == 0); + assert(DST_T_WIDTH <= 64); int buffer_idx = 0; + int ratio = DST_T_WIDTH / SRC_T_WIDTH; + long long int mask = (1ULL << SRC_T_WIDTH) - 1; + DST_T tmp = 0; for (int i = 0; i < y_size / y_block; i++) { for (int j = 0; j < x_size / x_block; j++) { for (int k = 0; k < y_block; k++) { - if (T_WIDTH < 8) { - for (int l = 0; l < x_block; l += 8 / T_WIDTH) { - dst[buffer_idx] = 0; - for (int m = 0; m < 8 / T_WIDTH; m++) { - dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] & - ((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH); - } - buffer_idx++; - } - } else { - for (int l = 0; l < x_block; l++) { - dst[buffer_idx++] = src[i * y_block + k][j * x_block + l]; + for (int l = 0; l < 
x_block; l++) { + int block_idx = l + k * x_block; + tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * SRC_T_WIDTH); + // When tmp is packed, write to destination array + if (block_idx % ratio == ratio - 1) { + dst[buffer_idx++] = tmp; + tmp = 0; } } } @@ -175,31 +170,28 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc } } -template -void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) { +template +void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) { + assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0); int buffer_idx = 0; + long long int mask = (1ULL << DST_T_WIDTH) - 1; + int ratio = SRC_T_WIDTH / DST_T_WIDTH; for (int i = 0; i < y_size / y_block; i++) { for (int j = 0; j < x_size / x_block; j++) { for (int k = 0; k < y_block; k++) { - if (T_WIDTH < 8) { - for (int l = 0; l < x_block; l += 8 / T_WIDTH) { - for (int m = 0; m < 8 / T_WIDTH; m++) { - dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH)) - & ((1 << T_WIDTH) - 1); - } + for (int l = 0; l < x_block; l++) { + int block_idx = l + k * x_block; + dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * DST_T_WIDTH)) & mask; + if (block_idx % ratio == ratio - 1) { buffer_idx++; } - } else { - for (int l = 0; l < x_block; l++) { - dst[i * y_block + k][j * x_block + l] = src[buffer_idx++]; - } } } } } } -template +template T ** allocInit2dArray(int rows, int cols) { // Allocate T **array = static_cast(malloc(sizeof(T *) * rows)); @@ -209,8 +201,23 @@ T ** allocInit2dArray(int rows, int cols) { // Init for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { - array[i][j] = - static_cast(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2))); + array[i][j] = static_cast(rand_r(&globalSeed)); + } + } + return array; +} + +template +T ** allocSet2dArray(int rows, int cols, int val) { + // Allocate + T **array = static_cast(malloc(sizeof(T *) * rows)); + for (int i = 0; i < rows; i++) { + array[i] = static_cast(malloc(sizeof(T) * cols)); + } + // Init + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + array[i][j] = static_cast(val); } } return array; @@ -563,45 +570,6 @@ void printParameters() { printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES); printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN); printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT); - printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1); - printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1); - printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2); - printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3); - printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4); - printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1); - printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1); - printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1); - printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1); - printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1); - printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1); - printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1); - printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1); - printf("VTA_INSN_MEM_D [%d-%d]\n", VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1); - printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1); - printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, 
VTA_INSN_GEM_0_1); - printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1); - printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2); - printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3); - printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4); - printf("VTA_INSN_GEM_5 [%d]\n", VTA_INSN_GEM_5); - printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1); - printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1); - printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1); - printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1); - printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1); - printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1); - printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1); - printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1); - printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1); - printf("VTA_INSN_GEM_F [%d-%d]\n", VTA_INSN_GEM_F_0, VTA_INSN_GEM_F_1); - printf("VTA_INSN_ALU_E [%d-%d]\n", VTA_INSN_ALU_E_0, VTA_INSN_ALU_E_1); - printf("VTA_INSN_ALU_F [%d]\n", VTA_INSN_ALU_F); - printf("VTA_INSN_ALU_G [%d-%d]\n", VTA_INSN_ALU_G_0, VTA_INSN_ALU_G_1); - printf("VTA_UOP_GEM_0 [%d-%d]\n", VTA_UOP_GEM_0_0, VTA_UOP_GEM_0_1); - printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1); - printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1); - printf("VTA_UOP_ALU_0 [%d-%d]\n", VTA_UOP_ALU_0_0, VTA_UOP_ALU_0_1); - printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1); } void printInstruction(int num_insn, VTAGenericInsn *insns) { @@ -742,7 +710,6 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp // Some assertions assert(batch % VTA_BATCH == 0); assert(vector_size % VTA_BLOCK_OUT == 0); - assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm)); printf("=====================================================================================\n"); printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); @@ -764,17 +731,21 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp for (int b = 0; b < batch / VTA_BATCH; b++) { if (opcode == VTA_ALU_OPCODE_MIN) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_MAX) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_ADD) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_SHR) { immediate[b] = static_cast( - rand_r(&globalSeed) % VTA_ACC_WIDTH - VTA_ACC_WIDTH/2); + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); } + // else if (opcode == VTA_ALU_OPCODE_MUL) { + // immediate[b] = static_cast( + // rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2))); + // } } // Initialize instructions @@ -845,7 +816,10 @@ int 
alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_ADD) { inputs[i][j] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3))); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); } } } @@ -854,54 +828,55 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp out_T **outputs_ref = alloc2dArray(batch, vector_size); for (int i = 0; i < batch; i++) { for (int j = 0; j < vector_size; j++) { - acc_T tmp = 0; + acc_T out_val = 0; + acc_T imm_val = immediate[i / VTA_BATCH]; + acc_T src_val = inputs[i][j + vector_size]; if (opcode == VTA_ALU_OPCODE_MIN) { if (!use_imm) { - tmp = inputs[i][j] < inputs[i][j + vector_size] ? - inputs[i][j] : - inputs[i][j + vector_size]; + out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val; } else { - tmp = inputs[i][j] < immediate[i / VTA_BATCH] ? - inputs[i][j] : - immediate[i / VTA_BATCH]; + out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val; } } else if (opcode == VTA_ALU_OPCODE_MAX) { if (!use_imm) { - tmp = inputs[i][j] > inputs[i][j + vector_size] ? - inputs[i][j] : - inputs[i][j + vector_size]; + out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val; } else { - tmp = inputs[i][j] > immediate[i / VTA_BATCH] ? - inputs[i][j] : - immediate[i / VTA_BATCH]; + out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val; } } else if (opcode == VTA_ALU_OPCODE_ADD) { if (!use_imm) { - tmp = inputs[i][j] + inputs[i][j + vector_size]; + out_val = inputs[i][j] + src_val; } else { - tmp = inputs[i][j] + immediate[i / VTA_BATCH]; + out_val = inputs[i][j] + imm_val; } } else if (opcode == VTA_ALU_OPCODE_SHR) { - if (immediate[i / VTA_BATCH] >= 0) { - tmp = inputs[i][j] >> immediate[i / VTA_BATCH]; + if (!use_imm) { + if (src_val >= 0) { + out_val = inputs[i][j] >> src_val; + } else { + out_val = inputs[i][j] << (0 - src_val); + } } else { - tmp = inputs[i][j] << (0 - immediate[i / VTA_BATCH]); + if (imm_val >= 0) { + out_val = inputs[i][j] >> imm_val; + } else { + out_val = inputs[i][j] << (0 - imm_val); + } } } - // Set - outputs_ref[i][j] = (out_T) tmp; + outputs_ref[i][j] = (out_T) out_val; } } // Pack input buffer - acc_T *bias_buf = - static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); - packBuffer( + uint32_t *bias_buf = static_cast( + allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); + packBuffer( bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT); // Prepare output buffer - out_T *output_buf = - static_cast(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets)); + uint32_t *output_buf = static_cast( + allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets)); #ifdef NO_SIM // Invoke the VTA @@ -914,20 +889,20 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) NULL, - (volatile wgt_vec_T *) NULL, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) NULL, + (volatile bus_T *) NULL, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output buffer out_T 
**outputs = alloc2dArray(batch, vector_size); - unpackBuffer(outputs, - output_buf, - batch, - vector_size, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + vector_size, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; @@ -1123,11 +1098,11 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, #endif // Initialize inputs - inp_T **inputs = allocInit2dArray(batch, in_feat); + inp_T **inputs = allocInit2dArray(batch, in_feat); // Initialize weights - wgt_T **weights = allocInit2dArray(out_feat, in_feat); + wgt_T **weights = allocInit2dArray(out_feat, in_feat); // Initialize biases - acc_T **biases = allocInit2dArray(batch, out_feat); + acc_T **biases = allocInit2dArray(batch, out_feat); // Reference GEMM implementation out_T **outputs_ref = alloc2dArray(batch, out_feat); @@ -1143,31 +1118,35 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, } // Prepare the input buffer - inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); - packBuffer(input_buf, - inputs, - batch, - in_feat, - VTA_BATCH, - VTA_BLOCK_IN); + uint32_t *input_buf = static_cast( + allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_feat, + VTA_BATCH, + VTA_BLOCK_IN); // Prepare the weight buffer - wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); - packBuffer(weight_buf, - weights, - out_feat, - in_feat, - VTA_BLOCK_OUT, - VTA_BLOCK_IN); + uint32_t *weight_buf = static_cast( + allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_feat, + in_feat, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); // Prepare the bias buffer - acc_T *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); - packBuffer(bias_buf, - biases, - batch, - out_feat, - VTA_BATCH, - VTA_BLOCK_OUT); + uint32_t *bias_buf = static_cast( + allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Prepare the output buffer - out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); + uint32_t *output_buf = static_cast( + allocBuffer(VTA_INP_ELEM_BYTES * out_size)); #ifdef NO_SIM // Invoke the VTA @@ -1187,20 +1166,20 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) input_buf, - (volatile wgt_vec_T *) weight_buf, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) input_buf, + (volatile bus_T *) weight_buf, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output data out_T **outputs = alloc2dArray(batch, out_feat); - unpackBuffer(outputs, - output_buf, - batch, - out_feat, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; @@ -1352,11 +1331,11 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression #endif // Initialize inputs - inp_T **inputs = allocInit2dArray(batch, in_channels); + inp_T **inputs = allocInit2dArray(batch, in_channels); // Initialize weights - wgt_T **weights = allocInit2dArray(out_channels, in_channels); + wgt_T **weights = allocInit2dArray(out_channels, in_channels); // Initialize biases - acc_T **biases = allocInit2dArray(batch, out_channels); + acc_T **biases = allocInit2dArray(batch, 
out_channels); // Reference GEMM implementation out_T **outputs_ref = alloc2dArray(batch, out_channels); @@ -1372,31 +1351,31 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression } // Prepare the input buffer - inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); - packBuffer(input_buf, - inputs, - batch, - in_channels, - VTA_BATCH, - VTA_BLOCK_IN); + uint32_t *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_channels, + VTA_BATCH, + VTA_BLOCK_IN); // Prepare the weight buffer - wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); - packBuffer(weight_buf, - weights, - out_channels, - in_channels, - VTA_BLOCK_OUT, - VTA_BLOCK_IN); + uint32_t *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_channels, + in_channels, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); // Prepare the bias buffer - acc_T *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); - packBuffer(bias_buf, - biases, - batch, - out_channels, - VTA_BATCH, - VTA_BLOCK_OUT); + uint32_t *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); // Prepare the output buffer - out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); + uint32_t *output_buf = static_cast(allocBuffer(VTA_OUT_ELEM_BYTES * out_size)); #ifdef NO_SIM // Invoke the VTA @@ -1416,20 +1395,20 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) input_buf, - (volatile wgt_vec_T *) weight_buf, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) input_buf, + (volatile bus_T *) weight_buf, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output data out_T **outputs = alloc2dArray(batch, out_channels); - unpackBuffer(outputs, - output_buf, - batch, - out_channels, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; diff --git a/vta/tests/hardware/common/test_lib.h b/vta/tests/hardware/common/test_lib.h index ee8c34009057..e4ba9c9944fb 100644 --- a/vta/tests/hardware/common/test_lib.h +++ b/vta/tests/hardware/common/test_lib.h @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ @@ -40,7 +39,6 @@ #include "../../../src/pynq/pynq_driver.h" #endif // VTA_TARGET_PYNQ -typedef uint64_t axi_T; typedef uint32_t uop_T; typedef int8_t wgt_T; typedef int8_t inp_T; @@ -95,14 +93,24 @@ template void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block); /*! -* \brief Allocates and initializes a 2D array in the heap. +* \brief Allocates and randomly initializes a 2D array in the heap. * \param rows Number of rows. * \param cols Number of columns. * \return Pointer to the 2D array. */ -template +template T ** allocInit2dArray(int rows, int cols); +/*! +* \brief Allocates and initializes a 2D array to a set value in the heap. +* \param rows Number of rows. +* \param cols Number of columns. +* \param val Value to set the whole array to. +* \return Pointer to the 2D array. 
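+*
+* A usage sketch (sizes below are illustrative; free2dArray is assumed to be
+* the matching deallocator declared in this header):
+*   out_T **zeros = allocSet2dArray<out_T>(batch, vector_size, 0);
+*   ...
+*   free2dArray(zeros, batch, vector_size);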
+*/ +template +T ** allocSet2dArray(int rows, int cols, int val); + /*! * \brief Allocates a 2D array in the heap. * \param rows Number of rows. diff --git a/vta/tests/python/unittest/test_environment.py b/vta/tests/python/unittest/test_environment.py index d5f7a6f43be9..605a9e0dfcdd 100644 --- a/vta/tests/python/unittest/test_environment.py +++ b/vta/tests/python/unittest/test_environment.py @@ -24,7 +24,7 @@ def test_env(): def test_env_scope(): env = vta.get_env() - cfg = env.pkg_config().cfg_dict + cfg = env.cfg_dict cfg["TARGET"] = "xyz" with vta.Environment(cfg): assert vta.get_env().TARGET == "xyz" diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index 393574932841..b21d205bd9d8 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -100,9 +100,9 @@ # the host, make sure you've set the variables below to the IP of # your board. device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, device_port) + remote = rpc.connect(device_host, int(device_port)) else: remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000)
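
A minimal host-side sketch of how the renamed pieces above fit together
(the tracker key now equals the configured TARGET; hosts, ports, and the
timeout below are illustrative, mirroring vta/python/vta/testing/util.py):

    import os
    import vta
    from tvm import rpc, autotvm

    env = vta.get_env()  # TARGET is "pynq" or "ultra96" on Zynq-type boards
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    if tracker_host and tracker_port:
        # RPC servers register under the target name
        # (see apps/vta_rpc/start_rpc_server_to_tracker.py)
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, int(tracker_port), timeout=10000)
    else:
        # Fall back to a direct connection to the board's RPC server
        remote = rpc.connect(
            os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99"),
            int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")))
    vta.reconfig_runtime(remote)              # pushes env.pkg.cfg_json to the device
    vta.program_fpga(remote, bitstream=None)  # programs the default bitstream for TARGET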