diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json index 247d255c34382..28e826acdbf6e 100644 --- a/vta/config/pynq_sample.json +++ b/vta/config/pynq_sample.json @@ -1,12 +1,8 @@ { - "TARGET" : "pynq", + "TARGET" : "sim", "HW_VER" : "0.0.0", "HW_FREQ" : 100, "HW_CLK_TARGET" : 7, - "ALU_EN" : true, - "MUL_EN" : false, - "GEMM_II" : 1, - "TALU_II" : 2, "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, @@ -16,7 +12,7 @@ "LOG_BLOCK_OUT" : 4, "LOG_BUS_WIDTH" : 6, "LOG_UOP_BUFF_SIZE" : 15, - "LOG_INP_BUFF_SIZE" : 15, + "LOG_INP_BUFF_SIZE" :15, "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json index 2a1f3aad600cc..fcba423477735 100644 --- a/vta/config/vta_config.json +++ b/vta/config/vta_config.json @@ -1,12 +1,8 @@ { - "TARGET" : "sim", + "TARGET" : "pynq", "HW_VER" : "0.0.0", "HW_FREQ" : 100, "HW_CLK_TARGET" : 7, - "ALU_EN" : true, - "MUL_EN" : false, - "GEMM_II" : 1, - "TALU_II" : 2, "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, @@ -16,7 +12,7 @@ "LOG_BLOCK_OUT" : 4, "LOG_BUS_WIDTH" : 6, "LOG_UOP_BUFF_SIZE" : 15, - "LOG_INP_BUFF_SIZE" : 15, + "LOG_INP_BUFF_SIZE" :15, "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py index 808a07b6d14d8..6d5ea705c1ea7 100644 --- a/vta/config/vta_config.py +++ b/vta/config/vta_config.py @@ -54,14 +54,6 @@ def main(): help="print the target") parser.add_argument("--cfg-str", action="store_true", help="print the configuration string") - parser.add_argument("--get-aluen", action="store_true", - help="returns whether ALU is enabled") - parser.add_argument("--get-mulen", action="store_true", - help="returns whether mul in ALU is enabled") - parser.add_argument("--get-gemmii", action="store_true", - help="returns the GEMM core II") - parser.add_argument("--get-taluii", action="store_true", - help="returns the tensor ALU core II") parser.add_argument("--get-inpwidth", action="store_true", help="returns log of input bitwidth") parser.add_argument("--get-wgtwidth", action="store_true", @@ -118,7 +110,7 @@ def main(): - cfg["LOG_ACC_WIDTH"]) # Generate bitstream config string. # Needs to match the BITSTREAM string in python/vta/environment.py - cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}s{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format( + cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}s{}_{}_{}_{}_{}_{}MHz_{}ns".format( cfg["TARGET"], (1 << cfg["LOG_BATCH"]), (1 << cfg["LOG_BLOCK_IN"]), @@ -132,12 +124,7 @@ def main(): cfg["LOG_WGT_BUFF_SIZE"], cfg["LOG_ACC_BUFF_SIZE"], cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["GEMM_II"]) - if cfg["ALU_EN"]: - cfg["BITSTREAM"] += "_aii{}".format(cfg["TALU_II"]) - if cfg["MUL_EN"] and cfg["ALU_EN"]: - cfg["BITSTREAM"] += "_mul" + cfg["HW_CLK_TARGET"]) pkg = get_pkg_config(cfg) if args.target: @@ -170,18 +157,6 @@ def main(): if args.cfg_str: print(cfg["BITSTREAM"]) - if args.get_aluen: - print(cfg["ALU_EN"]) - - if args.get_mulen: - print(cfg["MUL_EN"]) - - if args.get_gemmii: - print(cfg["GEMM_II"]) - - if args.get_taluii: - print(cfg["TALU_II"]) - if args.get_inpwidth: print(cfg["LOG_INP_WIDTH"]) diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index 79082036f535f..785ec7fcbd4f9 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -28,18 +28,9 @@ INCLUDE_DIR = $(ROOTDIR)/../../include # Executables VIVADO_HLS = vivado_hls VIVADO = vivado -HSI = hsi - -# HLS mode -MODE = all -# Debug flag -DEBUG = False -# SLURM -SLURM = False # Process VTA JSON config VTA_CONFIG := python $(CURDIR)/../../config/vta_config.py -CFLAGS := $(shell ${VTA_CONFIG} --cflags) VTA_TARGET := $(shell ${VTA_CONFIG} --target) #--------------------- @@ -58,16 +49,12 @@ VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize) VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize) VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize) VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize) -VTA_ALU_EN := $(shell ${VTA_CONFIG} --get-aluen) -VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen) #--------------------- # FPGA Parameters #-------------------- VTA_CLOCK_FREQ := $(shell ${VTA_CONFIG} --get-fpgafreq) VTA_TARGET_PER := $(shell ${VTA_CONFIG} --get-fpgaper) -VTA_GEMM_II := $(shell ${VTA_CONFIG} --get-gemmii) -VTA_TALU_II := $(shell ${VTA_CONFIG} --get-taluii) #--------------------- # Compilation parameters @@ -81,19 +68,13 @@ CONF := $(shell ${VTA_CONFIG} --cfg-str) IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF) HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF) -# Build on local scratch drive when using cluster -ifeq ($(SLURM), True) - IP_BUILD_PATH = /scratch/hls/$(CONF) - HW_BUILD_PATH = /scratch/vivado/$(CONF) -endif - # IP file path IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip # Bitstream file path BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit -.PHONY: all ip bit bsp clean clean_all +.PHONY: all ip bit clean clean_all all: bit ip: $(IP_PATH) @@ -102,40 +83,52 @@ bit: $(BIT_PATH) $(IP_PATH): $(SRC_DIR)/* mkdir -p $(IP_BUILD_PATH) cd $(IP_BUILD_PATH) && \ - $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ - -tclargs $(VTA_TARGET) \ - $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \ - $(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \ - $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) $(VTA_BUS_WIDTH) \ - $(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \ - $(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), True) - mkdir -p $(BUILD_DIR)/hls - mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/. -endif + $(VIVADO_HLS) \ + -f $(SCRIPT_DIR)/hls.tcl \ + -tclargs \ + $(VTA_TARGET) \ + $(SRC_DIR) \ + $(SIM_DIR) \ + $(TEST_DIR) \ + $(INCLUDE_DIR) \ + $(VTA_TARGET_PER) \ + $(VTA_INP_WIDTH) \ + $(VTA_WGT_WIDTH) \ + $(VTA_ACC_WIDTH) \ + $(VTA_OUT_WIDTH) \ + $(VTA_BATCH) \ + $(VTA_IN_BLOCK) \ + $(VTA_OUT_BLOCK) \ + $(VTA_BUS_WIDTH) \ + $(VTA_UOP_BUFF_SIZE) \ + $(VTA_INP_BUFF_SIZE) \ + $(VTA_WGT_BUFF_SIZE) \ + $(VTA_ACC_BUFF_SIZE) \ + $(VTA_OUT_BUFF_SIZE) $(BIT_PATH): $(IP_PATH) mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ - $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ - -tclargs $(VTA_TARGET) $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) \ - $(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ - $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), True) - mkdir -p $(BUILD_DIR)/vivado - mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/. -endif - -bsp: $(BIT_PATH) - cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog - cd $(HW_BUILD_PATH)/bsp && make + $(VIVADO) \ + -mode tcl \ + -source $(SCRIPT_DIR)/vivado.tcl \ + -tclargs \ + $(VTA_TARGET) \ + $(BUILD_DIR)/hls/$(CONF) \ + $(VTA_HW_COMP_THREADS) \ + $(VTA_CLOCK_FREQ) \ + $(VTA_INP_WIDTH) \ + $(VTA_WGT_WIDTH) \ + $(VTA_OUT_WIDTH) \ + $(VTA_BATCH) \ + $(VTA_IN_BLOCK) \ + $(VTA_OUT_BLOCK) \ + $(VTA_INP_BUFF_SIZE) \ + $(VTA_WGT_BUFF_SIZE) \ + $(VTA_OUT_BUFF_SIZE) clean: - rm -rf *.out *.log *.sb figures + rm -rf *.out *.log cleanall: clean rm -rf $(BUILD_DIR) diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index 75979c8c3af84..586e6c05a001e 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -21,53 +21,41 @@ # Arg 3: path to sim sources # Arg 4: path to test sources # Arg 5: path to include sources -# Arg 6: mode -# Arg 7: debug -# Arg 8: alu_ena -# Arg 9: mul_ena -# Arg 10: target clock period -# Arg 11: target II for GEMM -# Arg 12: target II for tensor ALU -# Arg 13: input type width (log) -# Arg 14: weight type width (log) -# Arg 15: accum type width (log) -# Arg 16: output type width (log) -# Arg 17: batch size (log) -# Arg 18: in block size (log) -# Arg 19: out block size (log) -# Arg 20: bus width in b (log) -# Arg 21: uop buffer size in B (log) -# Arg 22: inp buffer size in B (log) -# Arg 23: wgt buffer size in B (log) -# Arg 24: acc buffer size in B (log) -# Arg 25: out buffer size in B (log) - -if { [llength $argv] eq 27 } { - set target [lindex $argv 2] - set src_dir [lindex $argv 3] - set sim_dir [lindex $argv 4] - set test_dir [lindex $argv 5] - set include_dir [lindex $argv 6] - set mode [lindex $argv 7] - set debug [lindex $argv 8] - set alu_ena [lindex $argv 9] - set mul_ena [lindex $argv 10] - set target_period [lindex $argv 11] - set target_gemm_ii [lindex $argv 12] - set target_alu_ii [lindex $argv 13] - set inp_width [lindex $argv 14] - set wgt_width [lindex $argv 15] - set acc_width [lindex $argv 16] - set out_width [lindex $argv 17] - set batch [lindex $argv 18] - set block_in [lindex $argv 19] - set block_out [lindex $argv 20] - set bus_width [lindex $argv 21] - set uop_buff_size [lindex $argv 22] - set inp_buff_size [lindex $argv 23] - set wgt_buff_size [lindex $argv 24] - set acc_buff_size [lindex $argv 25] - set out_buff_size [lindex $argv 26] +# Arg 6: target clock period +# Arg 7: input type width (log) +# Arg 8: weight type width (log) +# Arg 9: accum type width (log) +# Arg 10: output type width (log) +# Arg 11: batch size (log) +# Arg 12: in block size (log) +# Arg 13: out block size (log) +# Arg 14: bus width in b (log) +# Arg 15: uop buffer size in B (log) +# Arg 16: inp buffer size in B (log) +# Arg 17: wgt buffer size in B (log) +# Arg 18: acc buffer size in B (log) +# Arg 19: out buffer size in B (log) + +if { [llength $argv] eq 21 } { + set target [lindex $argv 2] + set src_dir [lindex $argv 3] + set sim_dir [lindex $argv 4] + set test_dir [lindex $argv 5] + set include_dir [lindex $argv 6] + set per [lindex $argv 7] + set inp_width [lindex $argv 8] + set wgt_width [lindex $argv 9] + set acc_width [lindex $argv 10] + set out_width [lindex $argv 11] + set batch [lindex $argv 12] + set block_in [lindex $argv 13] + set block_out [lindex $argv 14] + set bus_width [lindex $argv 15] + set uop_buff_size [lindex $argv 16] + set inp_buff_size [lindex $argv 17] + set wgt_buff_size [lindex $argv 18] + set acc_buff_size [lindex $argv 19] + set out_buff_size [lindex $argv 20] } else { puts "Not enough arguments provided!" exit @@ -79,7 +67,7 @@ puts "about to start doing some stuff" # Initializes the HLS design and sets HLS pragmas for memory partitioning. # This is necessary because of a Vivado restriction that doesn't allow for # buses wider than 1024 bits. -proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} { +proc init_design {target per bus_width inp_width wgt_width out_width acc_width batch block_in block_out} { # Set device number if {$target=="pynq"} { @@ -88,9 +76,6 @@ proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width a set_part {xczu3eg-sbva484-1-e} } elseif {$target=="zcu102"} { set_part {xczu9eg-ffvb1156-2-e} - } elseif {$target=="f1"} { - set_part {xcvu9p-flgb2104-2-i} - # config_interface -m_axi_addr64 } # Max bus width (supported by Vivado) @@ -102,15 +87,8 @@ proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width a # Set the clock frequency create_clock -period $per -name default - # Set pipeline directive - set_directive_pipeline -II $g_ii "gemm/READ_GEMM_UOP" - - if {$alu_ena=="True"} { - set_directive_pipeline -II $a_ii "alu/READ_ALU_UOP" - } - - # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii) - set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $g_ii}] + # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/max_width) + set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch))}] set inp_partition_factor [expr {$inp_bus_width / $max_width}] if {$inp_partition_factor == 0} { set inp_reshape_factor [expr {$inp_bus_width / $axi_width}] @@ -123,8 +101,8 @@ proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width a set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem } - # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*g_ii)) - set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $g_ii}] + # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/max_width) + set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out))}] set wgt_partition_factor [expr {$wgt_bus_width / $max_width}] if {$wgt_partition_factor == 0} { set wgt_reshape_factor [expr {$wgt_bus_width / $axi_width}] @@ -137,8 +115,8 @@ proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width a set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem } - # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*g_ii)) - set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $g_ii}] + # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/max_width) + set out_bus_width [expr {(1 << ($out_width + $block_out + $batch))}] set out_partition_factor [expr {$out_bus_width / $max_width}] if {$out_partition_factor == 0} { set out_reshape_factor [expr {$out_bus_width / $axi_width}] @@ -164,87 +142,58 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \ -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \ -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ - -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size -DVTA_LOG_BUS_WIDTH=$bus_width \ - -DVTA_GEMM_II=$target_gemm_ii" -if {$debug=="True"} { - append cflags " -DVTA_DEBUG=1" -} -if {$alu_ena=="True"} { - append cflags " -DALU_EN" -} -if {$mul_ena=="True"} { - append cflags " -DMUL_EN" -} - + -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size -DVTA_LOG_BUS_WIDTH=$bus_width" # HLS behavioral sim -if {$mode=="all" || $mode=="sim"} { - open_project vta_sim - set_top vta - add_files $src_dir/vta.cc -cflags $cflags - add_files -tb $sim_dir/vta_test.cc -cflags $cflags - add_files -tb $test_dir/test_lib.cc -cflags $cflags - open_solution "solution0" - init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena - csim_design -clean - close_project -} +open_project vta_sim +set_top vta +add_files $src_dir/vta.cc -cflags $cflags +add_files -tb $sim_dir/vta_test.cc -cflags $cflags +add_files -tb $test_dir/test_lib.cc -cflags $cflags +open_solution "solution0" +init_design $target $per $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out +csim_design -clean +close_project # Generate fetch stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} { - open_project vta_fetch - set_top fetch - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_fetch +set_top fetch +add_files $src_dir/vta.cc -cflags $cflags +open_solution "solution0" +init_design $target $per $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out +csynth_design +export_design -format ip_catalog +close_project # Generate load stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} { - open_project vta_load - set_top load - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_load +set_top load +add_files $src_dir/vta.cc -cflags $cflags +open_solution "solution0" +init_design $target $per $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out +csynth_design +export_design -format ip_catalog +close_project # Generate compute stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} { - open_project vta_compute - set_top compute - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_compute +set_top compute +add_files $src_dir/vta.cc -cflags $cflags +open_solution "solution0" +init_design $target $per $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out +csynth_design +export_design -format ip_catalog +close_project # Generate store stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} { - open_project vta_store - set_top store - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_store +set_top store +add_files $src_dir/vta.cc -cflags $cflags +open_solution "solution0" +init_design $target $per $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out +csynth_design +export_design -format ip_catalog +close_project exit diff --git a/vta/hardware/xilinx/scripts/vivado.tcl b/vta/hardware/xilinx/scripts/vivado.tcl index 3a2e1d3cb0b7c..50326418fb59f 100644 --- a/vta/hardware/xilinx/scripts/vivado.tcl +++ b/vta/hardware/xilinx/scripts/vivado.tcl @@ -27,24 +27,23 @@ if { [string first $scripts_vivado_version $current_vivado_version] == -1 } { } # Parse argument list, derive the clock to utilize -if { [llength $argv] eq 14 } { +if { [llength $argv] eq 13 } { set target [lindex $argv 0] set ip_path [lindex $argv 1] set num_threads [lindex $argv 2] set clock_freq [lindex $argv 3] - set gemm_ii [lindex $argv 4] - set inp_width [expr 1 << [lindex $argv 5]] - set wgt_width [expr 1 << [lindex $argv 6]] - set out_width [expr 1 << [lindex $argv 7]] - set batch [expr 1 << [lindex $argv 8]] + set inp_width [expr 1 << [lindex $argv 4]] + set wgt_width [expr 1 << [lindex $argv 5]] + set out_width [expr 1 << [lindex $argv 6]] + set batch [expr 1 << [lindex $argv 7]] + set in_block [expr 1 << [lindex $argv 8]] set out_block [expr 1 << [lindex $argv 9]] - set in_block [expr 1 << [lindex $argv 10]] - set inp_mem_size [expr 1 << [lindex $argv 11]] - set wgt_mem_size [expr 1 << [lindex $argv 12]] - set out_mem_size [expr 1 << [lindex $argv 13]] + set inp_mem_size [expr 1 << [lindex $argv 10]] + set wgt_mem_size [expr 1 << [lindex $argv 11]] + set out_mem_size [expr 1 << [lindex $argv 12]] } else { puts "Arg list incomplete: \ - \ + \ " return 1 } @@ -53,7 +52,7 @@ if { [llength $argv] eq 14 } { set max_bus_width 1024 # Derive input mem parameters -set inp_mem_width [expr $inp_width * $batch * $in_block / $gemm_ii] +set inp_mem_width [expr $inp_width * $batch * $in_block] set inp_bus_width $max_bus_width set inp_part [expr $inp_mem_width / $inp_bus_width] if {[expr $inp_part == 0]} { @@ -63,7 +62,7 @@ if {[expr $inp_part == 0]} { set inp_mem_depth [expr $inp_mem_size * 8 / ($inp_mem_width * $inp_part)] # Derive weight mem parameters -set wgt_mem_width [expr $wgt_width * $out_block * $in_block / $gemm_ii] +set wgt_mem_width [expr $wgt_width * $out_block * $in_block] set wgt_bus_width $max_bus_width set wgt_part [expr $wgt_mem_width / $wgt_bus_width] if {[expr $wgt_part == 0]} { @@ -73,7 +72,7 @@ if {[expr $wgt_part == 0]} { set wgt_mem_depth [expr $wgt_mem_size * 8 / ($wgt_mem_width * $wgt_part)] # Derive output mem parameters -set out_mem_width [expr $out_width * $batch * $out_block / $gemm_ii] +set out_mem_width [expr $out_width * $batch * $out_block] set out_bus_width $max_bus_width set out_part [expr $out_mem_width / $out_bus_width] if {[expr $out_part == 0]} { diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc index 1e6432ff99579..e179f7341f6f3 100644 --- a/vta/hardware/xilinx/src/vta.cc +++ b/vta/hardware/xilinx/src/vta.cc @@ -260,7 +260,7 @@ void gemm( // Iterate over micro op READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) { - +#pragma HLS PIPELINE II = 1 // Read micro-op fields uop_T uop = uop_mem[upc]; @@ -348,6 +348,7 @@ void alu( EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) { // Iterate over micro op READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) { +#pragma HLS PIPELINE II = 2 // Read micro-op fields uop_T uop = uop_mem[upc]; diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 093b0ec5c3863..32bfcb27c9259 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -155,19 +155,21 @@ def __init__(self, cfg): self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8 self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8 # Configuration bitstream name - self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format( + self.BITSTREAM = "{}_{}x{}x{}_a{}w{}o{}s{}_{}_{}_{}_{}_{}MHz_{}ns".format( + cfg["TARGET"], (1 << cfg["LOG_BATCH"]), (1 << cfg["LOG_BLOCK_IN"]), (1 << cfg["LOG_BLOCK_OUT"]), (1 << cfg["LOG_INP_WIDTH"]), (1 << cfg["LOG_WGT_WIDTH"]), + (1 << cfg["LOG_OUT_WIDTH"]), + (1 << cfg["LOG_ACC_WIDTH"]), cfg["LOG_UOP_BUFF_SIZE"], cfg["LOG_INP_BUFF_SIZE"], cfg["LOG_WGT_BUFF_SIZE"], cfg["LOG_ACC_BUFF_SIZE"], cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) + cfg["HW_CLK_TARGET"]) # dtypes self.acc_dtype = "int%d" % self.ACC_WIDTH self.inp_dtype = "int%d" % self.INP_WIDTH