diff --git a/MANIFEST.in b/MANIFEST.in
index 47a636c11..a3482f084 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,3 +4,4 @@ graft example-prjs
graft example-models
graft test
recursive-include hls4ml/templates *
+include hls4ml/backends/vivado_accelerator/supported_boards.json
diff --git a/example-models b/example-models
index 0d4cc7277..ff74f73db 160000
--- a/example-models
+++ b/example-models
@@ -1 +1 @@
-Subproject commit 0d4cc7277eac9bb9020e3d73a992dc15dbdcce4e
+Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548
diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index 34d676d9c..1279ec22d 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -10,5 +10,33 @@
"tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
"python_drivers": {"axi_stream": "axi_stream_driver.py"},
"c_drivers": {}
+ },
+ "alveo-u50": {
+ "part": "xcu50-fsvh2104-2-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u250": {
+ "part": "xcu250-figd2104-2L-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u200": {
+ "part": "xcu200-fsgd2104-2-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u280": {
+ "part": "xcu280-fsvh2892-2L-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
}
-}
\ No newline at end of file
+}
diff --git a/hls4ml/backends/vivado_accelerator/vivado_accelerator_backend.py b/hls4ml/backends/vivado_accelerator/vivado_accelerator_backend.py
index 3f33900f9..63b83659f 100644
--- a/hls4ml/backends/vivado_accelerator/vivado_accelerator_backend.py
+++ b/hls4ml/backends/vivado_accelerator/vivado_accelerator_backend.py
@@ -12,20 +12,55 @@ def __init__(self):
def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False, bitfile=False):
+ """Run the Vivado HLS build and optionally produce the FPGA image.
+
+ When ``bitfile`` is true, boards whose name starts with ``alveo`` are handed to
+ ``make_xclbin``; all other boards run ``vivado -mode batch -source design.tcl``
+ from the model's output directory.
+
+ Returns the result of ``parse_vivado_report`` for the output directory.
+ """
# run the VivadoBackend build
+ # NOTE(review): `report` is not used below; the return value is re-parsed from the output directory.
report = super().build(model, reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)
+ # Get Config to view Board and Platform
+ # NOTE(review): the config is only consulted when `bitfile` is true, but it is built on every call.
+ from hls4ml.backends import VivadoAcceleratorConfig
+ vivado_accelerator_config=VivadoAcceleratorConfig(model.config, model.get_input_variables(),model.get_output_variables())
# now make a bitfile
if bitfile:
- curr_dir = os.getcwd()
- os.chdir(model.config.get_output_dir())
- try:
- os.system('vivado -mode batch -source design.tcl')
- except:
- print("Something went wrong, check the Vivado logs")
- os.chdir(curr_dir)
+ # Alveo boards are packaged as an xclbin instead of a plain Vivado bitstream
+ if(vivado_accelerator_config.get_board().startswith('alveo')):
+ self.make_xclbin(model,vivado_accelerator_config.get_platform())
+ else:
+ curr_dir = os.getcwd()
+ os.chdir(model.config.get_output_dir())
+ # NOTE(review): os.system returns the exit status instead of raising, so this
+ # except branch is unlikely to run when Vivado itself fails.
+ try:
+ os.system('vivado -mode batch -source design.tcl')
+ except:
+ print("Something went wrong, check the Vivado logs")
+ os.chdir(curr_dir)
return parse_vivado_report(model.config.get_output_dir())
+ def make_xclbin(self,model, platform='xilinx_u250_xdma_201830_2'):
+ """
+
+ Parameters
+ ----------
+ - model : compiled and built hls_model.
+ - platform : development Target Platform, must be installed first. On the host machine is required only the
+ deployment target platform, both can be found on the Getting Started section of the Alveo card.
+ """
+ curr_dir = os.getcwd()
+ abs_path_dir=os.path.abspath(model.config.get_output_dir())
+ os.chdir(abs_path_dir)
+ os.makedirs('xo_files', exist_ok=True)
+ try:
+ os.system('vivado -mode batch -source design.tcl')
+ except:
+ print("Something went wrong, check the Vivado logs")
+ project_name=model.config.get_project_name()
+ ip_repo_path = abs_path_dir + '/'+project_name+'_prj'+'/solution1/impl/ip'
+ os.makedirs('xclbin_files', exist_ok=True)
+ os.chdir(abs_path_dir + '/xclbin_files')
+ # TODO Add other platforms
+ vitis_cmd = "v++ -t hw --platform " + platform + " --link ../xo_files/"+project_name+"_kernel.xo -o'"+project_name+"_kernel.xclbin' --user_ip_repo_paths " + ip_repo_path
+ try:
+ os.system(vitis_cmd)
+ except:
+ print("Something went wrong, check the Vitis/Vivado logs")
+ os.chdir(curr_dir)
+
def create_initial_config(self, board='pynq-z2', part=None, clock_period=5, io_type='io_parallel', interface='axi_stream',
- driver='python', input_type='float', output_type='float'):
+ driver='python', input_type='float', output_type='float',platform='xilinx_u250_xdma_201830_2'):
'''
Create initial accelerator config with default parameters
Args:
@@ -42,6 +77,7 @@ def create_initial_config(self, board='pynq-z2', part=None, clock_period=5, io_t
will round the number of bits used to the next power-of-2 value.
output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note:
VivadoAcceleratorBackend will round the number of bits used to the next power-of-2 value.
+ platform: development target platform
Returns:
populated config
@@ -57,6 +93,9 @@ def create_initial_config(self, board='pynq-z2', part=None, clock_period=5, io_t
config['AcceleratorConfig']['Precision']['Output'] = {}
config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed
config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or ap_fixed
+ if board.startswith('alveo'):
+ config['AcceleratorConfig']['Platform'] = platform
+
return config
def _register_flows(self):
diff --git a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py
index 31828e5cd..f9c7848ef 100644
--- a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py
+++ b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py
@@ -47,6 +47,7 @@ def __init__(self, config, model_inputs, model_outputs):
'float') # float, double or ap_fixed
self.output_type = self.config['AcceleratorConfig']['Precision'].get('Output',
'float') # float, double or ap_fixed
+ self.platform= self.config['AcceleratorConfig'].get('Platform', 'xilinx_u250_xdma_201830_2') # Get platform folder name
assert len(
model_inputs) == 1, "Only models with one input tensor are currently supported by VivadoAcceleratorBackend"
@@ -118,14 +119,28 @@ def get_driver(self):
def get_board(self):
+ """Return the board name stored on this config (``self.board``)."""
return self.board
+ def get_platform(self):
+ """Return the target platform name read from ``AcceleratorConfig['Platform']`` in ``__init__``."""
+ return self.platform
+
+ def get_clock_period(self):
+ """Return ``self.clock_period``."""
+ # NOTE(review): the assignment of `clock_period` is not visible in this change - confirm __init__ sets it.
+ return self.clock_period
+
def get_driver_path(self):
- return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \
+ if self.board.startswith('alveo'):
+ return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + \
+ self.get_driver_file()
+ else:
+ return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \
self.get_driver_file()
def get_driver_file(self):
+ """Return the driver file name: ``<interface>_driver`` plus ``.py`` for the python driver, ``.h`` otherwise."""
driver_ext = '.py' if self.driver == 'python' else '.h'
return self.interface + '_driver' + driver_ext
+ def get_krnl_rtl_src_dir(self):
+ return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src'
+
+
def get_input_type(self):
+ """Return the wrapper input precision stored on this config (``self.input_type``)."""
return self.input_type
@@ -140,4 +155,8 @@ def get_tcl_file_path(self):
tcl_script = tcl_scripts.get(self.interface, None)
if tcl_script is None:
raise Exception('No tcl script definition available for the desired interface in supported_board.json')
- return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script
+ if self.board.startswith('alveo'):
+ return '../templates/vivado_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script
+ else:
+ return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script
+
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_read_master.sv b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_read_master.sv
new file mode 100644
index 000000000..a82dfc5a7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_read_master.sv
@@ -0,0 +1,280 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+// Description: This is a multi-threaded AXI4 read master. Each channel will
+// issue commands on a different IDs. As a result data may arrive out of
+// order. The amount of data requested is equal to the ctrl_length variable.
+// Prog full is set and sampled such that the FIFO will never overflow. Thus
+// rready can be always asserted for better timing.
+///////////////////////////////////////////////////////////////////////////////
+
+`default_nettype none
+
+// krnl_rtl_axi_read_master
+// Issues AXI4 read bursts for C_NUM_CHANNELS channels over one AXI4 master
+// port and presents every returned data beat on the AXI4-Stream output whose
+// index equals rid.
+// NOTE(review): ctrl_length appears to be a count of data beats (its low
+// C_LOG_BURST_LEN bits select the partial-burst length) - confirm with the caller.
+module krnl_rtl_axi_read_master #(
+ parameter integer C_ID_WIDTH = 0, // Must be >= $clog2(C_NUM_CHANNELS)
+ parameter integer C_ADDR_WIDTH = 64,
+ parameter integer C_DATA_WIDTH = 32,
+ parameter integer C_NUM_CHANNELS = 1, // Only 2 tested.
+ parameter integer C_LENGTH_WIDTH = 32,
+ parameter integer C_BURST_LEN = 256, // Max AXI burst length for read commands
+ parameter integer C_LOG_BURST_LEN = 8,
+ parameter integer C_MAX_OUTSTANDING = 3
+)
+(
+ // System signals
+ input wire aclk,
+ input wire areset,
+ // Control signals
+ input wire ctrl_start,
+ output wire ctrl_done,
+ input wire [C_NUM_CHANNELS-1:0][C_ADDR_WIDTH-1:0] ctrl_offset,
+ input wire [C_LENGTH_WIDTH-1:0] ctrl_length,
+ input wire [C_NUM_CHANNELS-1:0] ctrl_prog_full,
+ // AXI4 master interface
+ output wire arvalid,
+ input wire arready,
+ output wire [C_ADDR_WIDTH-1:0] araddr,
+ output wire [C_ID_WIDTH-1:0] arid,
+ output wire [7:0] arlen,
+ output wire [2:0] arsize,
+ input wire rvalid,
+ output wire rready,
+ input wire [C_DATA_WIDTH - 1:0] rdata,
+ input wire rlast,
+ input wire [C_ID_WIDTH - 1:0] rid,
+ input wire [1:0] rresp,
+ // AXI4-Stream master interface, 1 interface per channel.
+ output wire [C_NUM_CHANNELS-1:0] m_tvalid,
+ input wire [C_NUM_CHANNELS-1:0] m_tready,
+ output wire [C_NUM_CHANNELS-1:0][C_DATA_WIDTH-1:0] m_tdata,
+ output wire [C_NUM_CHANNELS-1:0] m_tlast
+);
+
+timeunit 1ps;
+timeprecision 1ps;
+
+///////////////////////////////////////////////////////////////////////////////
+// Local Parameters
+///////////////////////////////////////////////////////////////////////////////
+localparam integer LP_MAX_OUTSTANDING_CNTR_WIDTH = $clog2(C_MAX_OUTSTANDING+1);
+localparam integer LP_TRANSACTION_CNTR_WIDTH = C_LENGTH_WIDTH-C_LOG_BURST_LEN;
+
+///////////////////////////////////////////////////////////////////////////////
+// Variables
+///////////////////////////////////////////////////////////////////////////////
+// Control logic
+logic [C_NUM_CHANNELS-1:0] done = '0;
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] num_full_bursts;
+logic num_partial_bursts;
+logic start = 1'b0;
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] num_transactions;
+logic has_partial_burst;
+logic [C_LOG_BURST_LEN-1:0] final_burst_len;
+logic single_transaction;
+logic ar_idle = 1'b1;
+logic ar_done;
+// AXI Read Address Channel
+logic fifo_stall;
+logic arxfer;
+logic arvalid_r = 1'b0;
+logic [C_NUM_CHANNELS-1:0][C_ADDR_WIDTH-1:0] addr;
+logic [C_ID_WIDTH-1:0] id = {C_ID_WIDTH{1'b1}};
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] ar_transactions_to_go;
+logic ar_final_transaction;
+logic [C_NUM_CHANNELS-1:0] incr_ar_to_r_cnt;
+logic [C_NUM_CHANNELS-1:0] decr_ar_to_r_cnt;
+logic [C_NUM_CHANNELS-1:0] stall_ar;
+logic [C_NUM_CHANNELS-1:0][LP_MAX_OUTSTANDING_CNTR_WIDTH-1:0] outstanding_vacancy_count;
+// AXI Data Channel
+logic [C_NUM_CHANNELS-1:0] tvalid;
+logic [C_NUM_CHANNELS-1:0][C_DATA_WIDTH-1:0] tdata;
+logic [C_NUM_CHANNELS-1:0] tlast;
+logic rxfer;
+logic [C_NUM_CHANNELS-1:0] decr_r_transaction_cntr;
+logic [C_NUM_CHANNELS-1:0][LP_TRANSACTION_CNTR_WIDTH-1:0] r_transactions_to_go;
+logic [C_NUM_CHANNELS-1:0] r_final_transaction;
+///////////////////////////////////////////////////////////////////////////////
+// Control Logic
+///////////////////////////////////////////////////////////////////////////////
+
+// done[i] is set when channel i receives an rlast beat while its
+// r_final_transaction flag is high, and is cleared again once ctrl_done
+// (all done bits set) has been asserted.
+always @(posedge aclk) begin
+ for (int i = 0; i < C_NUM_CHANNELS; i++) begin
+ done[i] <= rxfer & rlast & (rid == i) & r_final_transaction[i] ? 1'b1 :
+ ctrl_done ? 1'b0 : done[i];
+ end
+end
+assign ctrl_done = &done;
+
+// Determine how many full burst to issue and if there are any partial bursts.
+assign num_full_bursts = ctrl_length[C_LOG_BURST_LEN+:C_LENGTH_WIDTH-C_LOG_BURST_LEN];
+assign num_partial_bursts = ctrl_length[0+:C_LOG_BURST_LEN] ? 1'b1 : 1'b0;
+
+// Register the start pulse and the burst bookkeeping. num_transactions holds
+// the total burst count minus one. final_burst_len holds the low length bits
+// minus one, which wraps to all ones when those low bits are zero.
+always @(posedge aclk) begin
+ start <= ctrl_start;
+ num_transactions <= (num_partial_bursts == 1'b0) ? num_full_bursts - 1'b1 : num_full_bursts;
+ has_partial_burst <= num_partial_bursts;
+ final_burst_len <= ctrl_length[0+:C_LOG_BURST_LEN] - 1'b1;
+end
+
+// Special case if there is only 1 AXI transaction.
+assign single_transaction = (num_transactions == {LP_TRANSACTION_CNTR_WIDTH{1'b0}}) ? 1'b1 : 1'b0;
+
+///////////////////////////////////////////////////////////////////////////////
+// AXI Read Address Channel
+///////////////////////////////////////////////////////////////////////////////
+assign arvalid = arvalid_r;
+assign araddr = addr[id];
+assign arlen = ar_final_transaction || (start & single_transaction) ? final_burst_len : C_BURST_LEN - 1;
+assign arsize = $clog2((C_DATA_WIDTH/8));
+assign arid = id;
+
+assign arxfer = arvalid & arready;
+assign fifo_stall = ctrl_prog_full[id];
+
+// Raise arvalid when the engine is not idle, the current channel is neither
+// stalled by its outstanding counter nor by ctrl_prog_full, and no request is
+// already pending; drop it once arready is observed.
+always @(posedge aclk) begin
+ if (areset) begin
+ arvalid_r <= 1'b0;
+ end
+ else begin
+ arvalid_r <= ~ar_idle & ~stall_ar[id] & ~arvalid_r & ~fifo_stall ? 1'b1 :
+ arready ? 1'b0 : arvalid_r;
+ end
+end
+
+// When ar_idle, there are no transactions to issue.
+always @(posedge aclk) begin
+ if (areset) begin
+ ar_idle <= 1'b1;
+ end
+ else begin
+ ar_idle <= start ? 1'b0 :
+ ar_done ? 1'b1 :
+ ar_idle;
+ end
+end
+
+// each channel is assigned a different id. The transactions are interleaved.
+// id restarts at all ones on start and counts down by one per accepted request.
+always @(posedge aclk) begin
+ if (start) begin
+ id <= {C_ID_WIDTH{1'b1}};
+ end
+ else begin
+ id <= arxfer ? id - 1'b1 : id;
+ end
+end
+
+
+// Increment to next address after each transaction is issued.
+always @(posedge aclk) begin
+ for (int i = 0; i < C_NUM_CHANNELS; i++) begin
+ addr[i] <= ctrl_start ? ctrl_offset[i] :
+ arxfer && (id == i) ? addr[i] + C_BURST_LEN*C_DATA_WIDTH/8 :
+ addr[i];
+ end
+end
+
+// Counts down the number of transactions to send.
+// NOTE(review): krnl_rtl_counter is defined outside this file; is_zero is
+// presumably high while count == 0 - confirm against its source.
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_TRANSACTION_CNTR_WIDTH ) ,
+ .C_INIT ( {LP_TRANSACTION_CNTR_WIDTH{1'b0}} )
+)
+inst_ar_transaction_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( start ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( arxfer && id == '0 ) ,
+ .load_value ( num_transactions ) ,
+ .count ( ar_transactions_to_go ) ,
+ .is_zero ( ar_final_transaction )
+);
+
+assign ar_done = ar_final_transaction && arxfer && id == 1'b0;
+
+// A completed read burst (rlast) frees one slot for the channel named by rid;
+// an accepted address request consumes one slot for the channel named by arid.
+always_comb begin
+ for (int i = 0; i < C_NUM_CHANNELS; i++) begin
+ incr_ar_to_r_cnt[i] = rxfer & rlast & (rid == i);
+ decr_ar_to_r_cnt[i] = arxfer & (arid == i);
+ end
+end
+
+// Keeps track of the number of outstanding transactions. Stalls
+// when the value is reached so that the FIFO won't overflow.
+// Instantiated as an array: one counter per channel, initialised to C_MAX_OUTSTANDING.
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_MAX_OUTSTANDING_CNTR_WIDTH ) ,
+ .C_INIT ( C_MAX_OUTSTANDING[0+:LP_MAX_OUTSTANDING_CNTR_WIDTH] )
+)
+inst_ar_to_r_transaction_cntr[C_NUM_CHANNELS-1:0] (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( 1'b0 ) ,
+ .incr ( incr_ar_to_r_cnt ) ,
+ .decr ( decr_ar_to_r_cnt ) ,
+ .load_value ( {LP_MAX_OUTSTANDING_CNTR_WIDTH{1'b0}} ) ,
+ .count ( outstanding_vacancy_count ) ,
+ .is_zero ( stall_ar )
+);
+
+///////////////////////////////////////////////////////////////////////////////
+// AXI Read Channel
+///////////////////////////////////////////////////////////////////////////////
+assign m_tvalid = tvalid;
+assign m_tdata = tdata;
+assign m_tlast = tlast;
+
+// rdata and rlast are broadcast to every stream; only the stream whose index
+// matches rid sees tvalid.
+always_comb begin
+ for (int i = 0; i < C_NUM_CHANNELS; i++) begin
+ tvalid[i] = rvalid && (rid == i);
+ tdata[i] = rdata;
+ tlast[i] = rlast;
+ end
+end
+
+// rready can remain high for optimal timing because ar transactions are not issued
+// unless there is enough space in the FIFO.
+assign rready = 1'b1;
+assign rxfer = rready & rvalid;
+
+// Per-channel countdown of bursts still to be received, decremented on each rlast.
+always_comb begin
+ for (int i = 0; i < C_NUM_CHANNELS; i++) begin
+ decr_r_transaction_cntr[i] = rxfer & rlast & (rid == i);
+ end
+end
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_TRANSACTION_CNTR_WIDTH ) ,
+ .C_INIT ( {LP_TRANSACTION_CNTR_WIDTH{1'b0}} )
+)
+inst_r_transaction_cntr[C_NUM_CHANNELS-1:0] (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( start ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( decr_r_transaction_cntr ) ,
+ .load_value ( num_transactions ) ,
+ .count ( r_transactions_to_go ) ,
+ .is_zero ( r_final_transaction )
+);
+
+
+endmodule : krnl_rtl_axi_read_master
+
+`default_nettype wire
+
+
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_write_master.sv b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_write_master.sv
new file mode 100644
index 000000000..ab41386db
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_axi_write_master.sv
@@ -0,0 +1,276 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+////////////////////////////////////////////////////////////
+// Description: AXI4 Write Master. Takes a stream of data in,
+// appends address information and sends it out.
+`default_nettype none
+
+// krnl_rtl_axi_write_master
+// Passes the incoming AXI4-Stream beats straight through to the AXI4 write
+// data channel, generates the matching write-address requests starting at
+// ctrl_offset, and pulses ctrl_done on the write response received while the
+// response counter reports its final transaction.
+// NOTE(review): ctrl_length appears to be a count of data beats - confirm with the caller.
+module krnl_rtl_axi_write_master #(
+ parameter integer C_ADDR_WIDTH = 64,
+ parameter integer C_DATA_WIDTH = 32,
+ parameter integer C_MAX_LENGTH_WIDTH = 32,
+ parameter integer C_BURST_LEN = 256,
+ parameter integer C_LOG_BURST_LEN = 8
+)
+(
+ // Control interface
+ input wire ctrl_start,
+ input wire [C_ADDR_WIDTH-1:0] ctrl_offset,
+ input wire [C_MAX_LENGTH_WIDTH-1:0] ctrl_length,
+ output wire ctrl_done,
+
+ // AXI4-Stream interface
+ input wire s_tvalid,
+ input wire [C_DATA_WIDTH-1:0] s_tdata,
+ output wire s_tready,
+
+ // AXI Interface
+ input wire aclk,
+ input wire areset,
+
+ output wire [C_ADDR_WIDTH-1:0] awaddr,
+ output wire [7:0] awlen,
+ output wire [2:0] awsize,
+ output wire awvalid,
+ input wire awready,
+
+ output wire [C_DATA_WIDTH-1:0] wdata,
+ output wire [C_DATA_WIDTH/8-1:0] wstrb,
+ output wire wlast,
+ output wire wvalid,
+ input wire wready,
+
+ input wire [1:0] bresp,
+ input wire bvalid,
+ output wire bready
+);
+
+timeunit 1ps;
+timeprecision 1ps;
+
+/////////////////////////////////////////////////////////////////////////////
+// Local Parameters
+/////////////////////////////////////////////////////////////////////////////
+localparam integer LP_LOG_MAX_W_TO_AW = 8; // Allow up to 256 outstanding w to aw transactions
+localparam integer LP_TRANSACTION_CNTR_WIDTH = C_MAX_LENGTH_WIDTH-C_LOG_BURST_LEN;
+
+/////////////////////////////////////////////////////////////////////////////
+// Variables
+/////////////////////////////////////////////////////////////////////////////
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] num_full_bursts;
+logic num_partial_bursts;
+logic start = 1'b0;
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] num_transactions;
+logic has_partial_burst;
+logic [C_LOG_BURST_LEN-1:0] final_burst_len;
+logic single_transaction;
+
+logic wxfer; // Unregistered write data transfer
+logic wfirst = 1'b1;
+logic load_burst_cntr;
+logic [C_LOG_BURST_LEN-1:0] wxfers_to_go; // Used for simulation debug
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] w_transactions_to_go;
+logic w_final_transaction;
+logic w_almost_final_transaction = 1'b0;
+
+logic awxfer;
+logic awvalid_r = 1'b0;
+logic [C_ADDR_WIDTH-1:0] addr;
+logic wfirst_d1 = 1'b0;
+logic wfirst_pulse = 1'b0;
+logic [LP_LOG_MAX_W_TO_AW-1:0] dbg_w_to_aw_outstanding;
+logic idle_aw;
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] aw_transactions_to_go;
+logic aw_final_transaction;
+
+wire bxfer;
+logic [LP_TRANSACTION_CNTR_WIDTH-1:0] b_transactions_to_go;
+logic b_final_transaction;
+
+/////////////////////////////////////////////////////////////////////////////
+// Control logic
+/////////////////////////////////////////////////////////////////////////////
+// Count the number of transfers and assert done when the last bvalid is received.
+assign num_full_bursts = ctrl_length[C_LOG_BURST_LEN+:C_MAX_LENGTH_WIDTH-C_LOG_BURST_LEN];
+assign num_partial_bursts = ctrl_length[0+:C_LOG_BURST_LEN] ? 1'b1 : 1'b0;
+
+// Register the start pulse and the burst bookkeeping. num_transactions holds
+// the total burst count minus one. final_burst_len holds the low length bits
+// minus one, which wraps to all ones when those low bits are zero.
+always @(posedge aclk) begin
+ start <= ctrl_start;
+ num_transactions <= (num_partial_bursts == 1'b0) ? num_full_bursts - 1'b1 : num_full_bursts;
+ has_partial_burst <= num_partial_bursts;
+ final_burst_len <= ctrl_length[0+:C_LOG_BURST_LEN] - 1'b1;
+end
+
+assign ctrl_done = bxfer & b_final_transaction;
+assign single_transaction = (num_transactions == {LP_TRANSACTION_CNTR_WIDTH{1'b0}}) ? 1'b1 : 1'b0;
+
+/////////////////////////////////////////////////////////////////////////////
+// AXI Write Data Channel
+/////////////////////////////////////////////////////////////////////////////
+// The stream is wired directly onto the write data channel; all byte lanes
+// are always enabled.
+assign wvalid = s_tvalid;
+assign wdata = s_tdata;
+assign wstrb = {(C_DATA_WIDTH/8){1'b1}};
+assign s_tready = wready;
+
+assign wxfer = wvalid & wready;
+
+// wfirst takes the value of wlast after every accepted beat, so it is high
+// while the next beat to be accepted is the first beat of a burst.
+always @(posedge aclk) begin
+ if (areset) begin
+ wfirst <= 1'b1;
+ end
+ else begin
+ wfirst <= wxfer ? wlast : wfirst;
+ end
+end
+
+// Load burst counter with partial burst if on final transaction or if there is only 1 transaction
+assign load_burst_cntr = (wxfer & wlast & w_almost_final_transaction) || (start & single_transaction);
+
+// NOTE(review): krnl_rtl_counter is defined outside this file; is_zero is
+// presumably high while count == 0 - confirm against its source.
+krnl_rtl_counter #(
+ .C_WIDTH ( C_LOG_BURST_LEN ) ,
+ .C_INIT ( {C_LOG_BURST_LEN{1'b1}} )
+)
+inst_burst_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( load_burst_cntr ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( wxfer ) ,
+ .load_value ( final_burst_len ) ,
+ .count ( wxfers_to_go ) ,
+ .is_zero ( wlast )
+);
+
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_TRANSACTION_CNTR_WIDTH ) ,
+ .C_INIT ( {LP_TRANSACTION_CNTR_WIDTH{1'b0}} )
+)
+inst_w_transaction_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( start ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( wxfer & wlast ) ,
+ .load_value ( num_transactions ) ,
+ .count ( w_transactions_to_go ) ,
+ .is_zero ( w_final_transaction )
+);
+
+// Registered flag: exactly one more burst remains after the current one.
+always @(posedge aclk) begin
+ w_almost_final_transaction <= (w_transactions_to_go == 1) ? 1'b1 : 1'b0;
+end
+
+/////////////////////////////////////////////////////////////////////////////
+// AXI Write Address Channel
+/////////////////////////////////////////////////////////////////////////////
+// The address channel samples the data channel and send out transactions when
+// first beat of wdata is asserted. This ensures that address requests are not
+// sent without data on the way.
+
+assign awvalid = awvalid_r;
+assign awxfer = awvalid & awready;
+
+// Raise awvalid while the w-to-aw counter is non-idle and no request is
+// pending; drop it once awready is observed.
+always @(posedge aclk) begin
+ if (areset) begin
+ awvalid_r <= 1'b0;
+ end
+ else begin
+ awvalid_r <= ~idle_aw & ~awvalid_r ? 1'b1 :
+ awready ? 1'b0 :
+ awvalid_r;
+ end
+end
+
+assign awaddr = addr;
+
+// Load the base address on ctrl_start and advance by one full burst in bytes
+// after every accepted address request.
+always @(posedge aclk) begin
+ addr <= ctrl_start ? ctrl_offset :
+ awxfer ? addr + C_BURST_LEN*C_DATA_WIDTH/8 :
+ addr;
+end
+
+assign awlen = aw_final_transaction || (start & single_transaction) ? final_burst_len : C_BURST_LEN - 1;
+assign awsize = $clog2((C_DATA_WIDTH/8));
+
+// Counts bursts whose first data beat has been seen but whose address request
+// has not yet been accepted. load is tied low, so load_value is left unconnected.
+krnl_rtl_counter #(
+ .C_WIDTH (LP_LOG_MAX_W_TO_AW),
+ .C_INIT ({LP_LOG_MAX_W_TO_AW{1'b0}})
+)
+inst_w_to_aw_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( 1'b0 ) ,
+ .incr ( wfirst_pulse ) ,
+ .decr ( awxfer ) ,
+ .load_value ( ) ,
+ .count ( dbg_w_to_aw_outstanding ) ,
+ .is_zero ( idle_aw )
+);
+
+// wfirst_pulse is a registered rising-edge detect of (wvalid & wfirst).
+always @(posedge aclk) begin
+ wfirst_d1 <= wvalid & wfirst;
+end
+
+always @(posedge aclk) begin
+ wfirst_pulse <= wvalid & wfirst & ~wfirst_d1;
+end
+
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_TRANSACTION_CNTR_WIDTH ) ,
+ .C_INIT ( {LP_TRANSACTION_CNTR_WIDTH{1'b0}} )
+)
+inst_aw_transaction_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( start ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( awxfer ) ,
+ .load_value ( num_transactions ) ,
+ .count ( aw_transactions_to_go ) ,
+ .is_zero ( aw_final_transaction )
+);
+
+/////////////////////////////////////////////////////////////////////////////
+// AXI Write Response Channel
+/////////////////////////////////////////////////////////////////////////////
+
+// Responses are always accepted; bresp itself is not examined in this module.
+assign bready = 1'b1;
+assign bxfer = bready & bvalid;
+
+krnl_rtl_counter #(
+ .C_WIDTH ( LP_TRANSACTION_CNTR_WIDTH ) ,
+ .C_INIT ( {LP_TRANSACTION_CNTR_WIDTH{1'b0}} )
+)
+inst_b_transaction_cntr (
+ .clk ( aclk ) ,
+ .clken ( 1'b1 ) ,
+ .rst ( areset ) ,
+ .load ( start ) ,
+ .incr ( 1'b0 ) ,
+ .decr ( bxfer ) ,
+ .load_value ( num_transactions ) ,
+ .count ( b_transactions_to_go ) ,
+ .is_zero ( b_final_transaction )
+);
+
+endmodule : krnl_rtl_axi_write_master
+
+`default_nettype wire
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v
new file mode 100644
index 000000000..c4a76ef0c
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v
@@ -0,0 +1,422 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+`timescale 1ns/1ps
+module krnl_rtl_control_s_axi
+#(parameter
+ C_S_AXI_ADDR_WIDTH = 6,
+ C_S_AXI_DATA_WIDTH = 32
+)(
+ // axi4 lite slave signals
+ input wire ACLK,
+ input wire ARESET,
+ input wire ACLK_EN,
+ input wire [C_S_AXI_ADDR_WIDTH-1:0] AWADDR,
+ input wire AWVALID,
+ output wire AWREADY,
+ input wire [C_S_AXI_DATA_WIDTH-1:0] WDATA,
+ input wire [C_S_AXI_DATA_WIDTH/8-1:0] WSTRB,
+ input wire WVALID,
+ output wire WREADY,
+ output wire [1:0] BRESP,
+ output wire BVALID,
+ input wire BREADY,
+ input wire [C_S_AXI_ADDR_WIDTH-1:0] ARADDR,
+ input wire ARVALID,
+ output wire ARREADY,
+ output wire [C_S_AXI_DATA_WIDTH-1:0] RDATA,
+ output wire [1:0] RRESP,
+ output wire RVALID,
+ input wire RREADY,
+ output wire interrupt,
+ // user signals
+ output wire ap_start,
+ input wire ap_done,
+ input wire ap_ready,
+ input wire ap_idle,
+ output wire [63:0] fifo_in,
+ output wire [63:0] fifo_out,
+ output wire [31:0] length_r_in,
+ output wire [31:0] length_r_out
+);
+//------------------------Address Info-------------------
+// 0x00 : Control signals
+// bit 0 - ap_start (Read/Write/COH)
+// bit 1 - ap_done (Read/COR)
+// bit 2 - ap_idle (Read)
+// bit 3 - ap_ready (Read)
+// bit 7 - auto_restart (Read/Write)
+// others - reserved
+// 0x04 : Global Interrupt Enable Register
+// bit 0 - Global Interrupt Enable (Read/Write)
+// others - reserved
+// 0x08 : IP Interrupt Enable Register (Read/Write)
+// bit 0 - Channel 0 (ap_done)
+// bit 1 - Channel 1 (ap_ready)
+// others - reserved
+// 0x0c : IP Interrupt Status Register (Read/TOW)
+// bit 0 - Channel 0 (ap_done)
+// bit 1 - Channel 1 (ap_ready)
+// others - reserved
+// 0x10 : Data signal of fifo_in
+// bit 31~0 - a[31:0] (Read/Write)
+// 0x14 : Data signal of fifo_in
+// bit 31~0 - a[63:32] (Read/Write)
+// 0x18 : reserved
+// 0x1c : Data signal of fifo_out
+// bit 31~0 - b[31:0] (Read/Write)
+// 0x20 : Data signal of fifo_out
+// bit 31~0 - b[63:32] (Read/Write)
+// 0x24 : reserved
+// 0x28 : Data signal of length_r_in
+// bit 31~0 - length_r[31:0] (Read/Write)
+// 0x2c : reserved
+// 0x30 : Data signal of length_r_out
+// bit 31~0 - length_r[31:0] (Read/Write)
+// 0x34 : reserved
+// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
+
+//------------------------Parameter----------------------
+localparam
+ ADDR_AP_CTRL = 6'h00,
+ ADDR_GIE = 6'h04,
+ ADDR_IER = 6'h08,
+ ADDR_ISR = 6'h0c,
+ ADDR_FIFO_IN_DATA_0 = 6'h10,
+ ADDR_FIFO_IN_DATA_1 = 6'h14,
+ ADDR_FIFO_IN_CTRL = 6'h18,
+ ADDR_FIFO_OUT_DATA_0 = 6'h1c,
+ ADDR_FIFO_OUT_DATA_1 = 6'h20,
+ ADDR_FIFO_OUT_CTRL = 6'h24,
+ ADDR_LENGTH_R_IN_DATA_0 = 6'h28,
+ ADDR_LENGTH_R_IN_CTRL = 6'h2c,
+ ADDR_LENGTH_R_OUT_DATA_0 = 6'h30,
+ ADDR_LENGTH_R_OUT_CTRL = 6'h34,
+ WRIDLE = 2'd0,
+ WRDATA = 2'd1,
+ WRRESP = 2'd2,
+ RDIDLE = 2'd0,
+ RDDATA = 2'd1,
+ ADDR_BITS = 6;
+
+//------------------------Local signal-------------------
+ reg [1:0] wstate = WRIDLE;
+ reg [1:0] wnext;
+ reg [ADDR_BITS-1:0] waddr;
+ wire [31:0] wmask;
+ wire aw_hs;
+ wire w_hs;
+ reg [1:0] rstate = RDIDLE;
+ reg [1:0] rnext;
+ reg [31:0] rdata;
+ wire ar_hs;
+ wire [ADDR_BITS-1:0] raddr;
+ // internal registers
+ wire int_ap_idle;
+ wire int_ap_ready;
+ reg int_ap_done = 1'b0;
+ reg int_ap_start = 1'b0;
+ reg int_auto_restart = 1'b0;
+ reg int_gie = 2'b0;
+ reg [1:0] int_ier = 2'b0;
+ reg [1:0] int_isr = 2'b0;
+ reg [63:0] int_fifo_in = 64'b0;
+ reg [63:0] int_fifo_out = 64'b0;
+ reg [63:0] int_length_r_in = 32'b0;
+ reg [31:0] int_length_r_out = 32'b0;
+
+//------------------------Instantiation------------------
+
+//------------------------AXI write fsm------------------
+assign AWREADY = (~ARESET) & (wstate == WRIDLE);
+assign WREADY = (wstate == WRDATA);
+assign BRESP = 2'b00; // OKAY
+assign BVALID = (wstate == WRRESP);
+assign wmask = { {8{WSTRB[3]}}, {8{WSTRB[2]}}, {8{WSTRB[1]}}, {8{WSTRB[0]}} };
+assign aw_hs = AWVALID & AWREADY;
+assign w_hs = WVALID & WREADY;
+
+// wstate
+always @(posedge ACLK) begin
+ if (ARESET)
+ wstate <= WRIDLE;
+ else if (ACLK_EN)
+ wstate <= wnext;
+end
+
+// wnext
+always @(*) begin
+ case (wstate)
+ WRIDLE:
+ if (AWVALID)
+ wnext = WRDATA;
+ else
+ wnext = WRIDLE;
+ WRDATA:
+ if (WVALID)
+ wnext = WRRESP;
+ else
+ wnext = WRDATA;
+ WRRESP:
+ if (BREADY)
+ wnext = WRIDLE;
+ else
+ wnext = WRRESP;
+ default:
+ wnext = WRIDLE;
+ endcase
+end
+
+// waddr
+always @(posedge ACLK) begin
+ if (ACLK_EN) begin
+ if (aw_hs)
+ waddr <= AWADDR[ADDR_BITS-1:0];
+ end
+end
+
+//------------------------AXI read fsm-------------------
+assign ARREADY = (~ARESET) && (rstate == RDIDLE); // address accepted only in RDIDLE and never while ARESET is high
+assign RDATA = rdata;
+assign RRESP = 2'b00; // OKAY
+assign RVALID = (rstate == RDDATA); // data stays valid for as long as the FSM is in RDDATA
+assign ar_hs = ARVALID & ARREADY; // read-address handshake
+assign raddr = ARADDR[ADDR_BITS-1:0]; // taken straight from the bus, not registered
+
+// rstate: read FSM state register; synchronous reset to RDIDLE, advances only while ACLK_EN is high
+always @(posedge ACLK) begin
+ if (ARESET)
+ rstate <= RDIDLE;
+ else if (ACLK_EN)
+ rstate <= rnext;
+end
+
+// rnext: combinational next state, RDIDLE -> RDDATA -> RDIDLE
+always @(*) begin
+ case (rstate)
+ RDIDLE:
+ if (ARVALID) // address offered: present the data next
+ rnext = RDDATA;
+ else
+ rnext = RDIDLE;
+ RDDATA:
+ if (RREADY & RVALID) // data taken by the master
+ rnext = RDIDLE;
+ else
+ rnext = RDDATA;
+ default:
+ rnext = RDIDLE; // any unexpected encoding falls back to idle
+ endcase
+end
+
+// rdata: loaded on the read-address handshake with the register selected by raddr
+always @(posedge ACLK) begin
+ if (ACLK_EN) begin
+ if (ar_hs) begin
+ rdata <= 1'b0; // default: unmapped addresses and unassigned bits read back as zero
+ case (raddr)
+ ADDR_AP_CTRL: begin // control word: start, done, idle, ready in bits 0..3, auto-restart in bit 7
+ rdata[0] <= int_ap_start;
+ rdata[1] <= int_ap_done;
+ rdata[2] <= int_ap_idle;
+ rdata[3] <= int_ap_ready;
+ rdata[7] <= int_auto_restart;
+ end
+ ADDR_GIE: begin
+ rdata <= int_gie;
+ end
+ ADDR_IER: begin
+ rdata <= int_ier;
+ end
+ ADDR_ISR: begin
+ rdata <= int_isr;
+ end
+ ADDR_FIFO_IN_DATA_0: begin // low word of the 64-bit fifo_in value
+ rdata <= int_fifo_in[31:0];
+ end
+ ADDR_FIFO_IN_DATA_1: begin // high word of the 64-bit fifo_in value
+ rdata <= int_fifo_in[63:32];
+ end
+ ADDR_FIFO_OUT_DATA_0: begin // low word of the 64-bit fifo_out value
+ rdata <= int_fifo_out[31:0];
+ end
+ ADDR_FIFO_OUT_DATA_1: begin // high word of the 64-bit fifo_out value
+ rdata <= int_fifo_out[63:32];
+ end
+ ADDR_LENGTH_R_IN_DATA_0: begin
+ rdata <= int_length_r_in[31:0];
+ end
+ ADDR_LENGTH_R_OUT_DATA_0: begin
+ rdata <= int_length_r_out[31:0];
+ end
+ endcase
+ end
+ end
+end
+
+
+//------------------------Register logic-----------------
+assign interrupt = int_gie & (|int_isr); // raised when globally enabled and any status bit is set
+assign ap_start = int_ap_start;
+assign int_ap_idle = ap_idle;
+assign int_ap_ready = ap_ready;
+assign fifo_in = int_fifo_in;
+assign fifo_out = int_fifo_out;
+assign length_r_in = int_length_r_in;
+assign length_r_out = int_length_r_out;
+// int_ap_start: set by writing 1 to bit 0 of AP_CTRL (byte 0 enabled); reloaded from int_auto_restart when ap_ready is seen
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_ap_start <= 1'b0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0] && WDATA[0])
+ int_ap_start <= 1'b1;
+ else if (int_ap_ready)
+ int_ap_start <= int_auto_restart; // clear on handshake/auto restart
+ end
+end
+
+// int_ap_done: sticky copy of ap_done; setting takes priority over the clear-on-read of AP_CTRL
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_ap_done <= 1'b0;
+ else if (ACLK_EN) begin
+ if (ap_done)
+ int_ap_done <= 1'b1;
+ else if (ar_hs && raddr == ADDR_AP_CTRL)
+ int_ap_done <= 1'b0; // clear on read
+ end
+end
+
+// int_auto_restart: follows bit 7 of every write to AP_CTRL that enables byte 0
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_auto_restart <= 1'b0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0])
+ int_auto_restart <= WDATA[7];
+ end
+end
+
+// int_gie: global interrupt enable, bit 0 of the GIE register
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_gie <= 1'b0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_GIE && WSTRB[0])
+ int_gie <= WDATA[0];
+ end
+end
+
+// int_ier: per-source interrupt enables, bits 1:0 of the IER register ([0] gates ap_done, [1] gates ap_ready)
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_ier <= 1'b0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_IER && WSTRB[0])
+ int_ier <= WDATA[1:0];
+ end
+end
+
+// int_isr[0]: set when ap_done arrives while int_ier[0] is set; the event takes priority over a simultaneous write
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_isr[0] <= 1'b0;
+ else if (ACLK_EN) begin
+ if (int_ier[0] & ap_done)
+ int_isr[0] <= 1'b1;
+ else if (w_hs && waddr == ADDR_ISR && WSTRB[0])
+ int_isr[0] <= int_isr[0] ^ WDATA[0]; // toggle on write
+ end
+end
+
+// int_isr[1]: set when ap_ready arrives while int_ier[1] is set; the event takes priority over a simultaneous write
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_isr[1] <= 1'b0;
+ else if (ACLK_EN) begin
+ if (int_ier[1] & ap_ready)
+ int_isr[1] <= 1'b1;
+ else if (w_hs && waddr == ADDR_ISR && WSTRB[0])
+ int_isr[1] <= int_isr[1] ^ WDATA[1]; // toggle on write
+ end
+end
+
+// int_fifo_in[31:0]: low word; a write merges WDATA into the register byte lane by byte lane according to wmask
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_fifo_in[31:0] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_FIFO_IN_DATA_0)
+ int_fifo_in[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_in[31:0] & ~wmask);
+ end
+end
+
+// int_fifo_in[63:32]: high word, same byte-masked merge
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_fifo_in[63:32] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_FIFO_IN_DATA_1)
+ int_fifo_in[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_in[63:32] & ~wmask);
+ end
+end
+
+// int_fifo_out[31:0]: low word, same byte-masked merge
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_fifo_out[31:0] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_FIFO_OUT_DATA_0)
+ int_fifo_out[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_out[31:0] & ~wmask);
+ end
+end
+
+// int_fifo_out[63:32]: high word, same byte-masked merge
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_fifo_out[63:32] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_FIFO_OUT_DATA_1)
+ int_fifo_out[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_out[63:32] & ~wmask);
+ end
+end
+
+// int_length_r_in[31:0]: single 32-bit word, same byte-masked merge
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_length_r_in[31:0] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_LENGTH_R_IN_DATA_0)
+ int_length_r_in[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_in[31:0] & ~wmask);
+ end
+end
+
+
+// int_length_r_out[31:0]: single 32-bit word, same byte-masked merge
+always @(posedge ACLK) begin
+ if (ARESET)
+ int_length_r_out[31:0] <= 0;
+ else if (ACLK_EN) begin
+ if (w_hs && waddr == ADDR_LENGTH_R_OUT_DATA_0)
+ int_length_r_out[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_out[31:0] & ~wmask);
+ end
+end
+
+
+//------------------------Memory logic-------------------
+
+endmodule
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_counter.sv b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_counter.sv
new file mode 100644
index 000000000..631cde7b0
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_counter.sv
@@ -0,0 +1,88 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+//-----------------------------------------------------------------------------
+// Simple up/down counter with reset.
+//-----------------------------------------------------------------------------
+`default_nettype none
+`timescale 1ps/1ps
+module krnl_rtl_counter #(
+  parameter integer C_WIDTH = 4,
+  parameter [C_WIDTH-1:0] C_INIT = {C_WIDTH{1'b0}}
+)
+(
+  input wire clk,
+  input wire clken,
+  input wire rst,
+  input wire load,
+  input wire incr,
+  input wire decr,
+  input wire [C_WIDTH-1:0] load_value,
+  output wire [C_WIDTH-1:0] count,
+  output wire is_zero
+);
+
+  // C_WIDTH-bit loadable up/down counter with a registered zero flag.
+  //   rst        synchronous reset to C_INIT (takes effect even when clken is low)
+  //   clken      gates load, incr and decr
+  //   load       loads load_value; wins over incr/decr
+  //   incr/decr  step by one; asserting both (or neither) holds the count
+  //   is_zero    registered flag, high while count equals zero
+
+  localparam [C_WIDTH-1:0] LP_ZERO = {C_WIDTH{1'b0}};
+  localparam [C_WIDTH-1:0] LP_ONE = {{C_WIDTH-1{1'b0}},1'b1};
+  localparam [C_WIDTH-1:0] LP_MAX = {C_WIDTH{1'b1}};
+
+  reg [C_WIDTH-1:0] count_r = C_INIT;
+  reg is_zero_r = (C_INIT == LP_ZERO);
+
+  // A step happens only when exactly one of incr/decr is asserted.
+  wire step_up = incr & ~decr;
+  wire step_down = ~incr & decr;
+
+  assign count = count_r;
+  assign is_zero = is_zero_r;
+
+  // count_r and is_zero_r are updated together. The flag is derived from the
+  // pre-update count, so after each edge it describes the new count_r value.
+  always @(posedge clk) begin
+    if (rst) begin
+      count_r <= C_INIT;
+      is_zero_r <= (C_INIT == LP_ZERO);
+    end
+    else if (clken) begin
+      if (load) begin
+        count_r <= load_value;
+        is_zero_r <= (load_value == LP_ZERO);
+      end
+      else if (step_up) begin
+        // Incrementing from all-ones wraps to zero.
+        count_r <= count_r + 1'b1;
+        is_zero_r <= (count_r == LP_MAX);
+      end
+      else if (step_down) begin
+        // Decrementing from one reaches zero.
+        count_r <= count_r - 1'b1;
+        is_zero_r <= (count_r == LP_ONE);
+      end
+      // No step requested: both registers keep their value.
+    end
+  end
+
+
+endmodule : krnl_rtl_counter
+`default_nettype wire
+
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_int.sv b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_int.sv
new file mode 100644
index 000000000..63581799c
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_int.sv
@@ -0,0 +1,415 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+// Description: RTL kernel that wraps the hls4ml neural-network IP. Input data
+// is read through one AXI4 memory mapped master, streamed through the NN
+// inference IP (myproject_axi_0), and the results are then written out.
+//
+// Data flow: axi_read_master->fifo->NN inference->fifo->axi_write_master
+///////////////////////////////////////////////////////////////////////////////
+
+// default_nettype of none prevents implicit wire declaration.
+`default_nettype none
+`timescale 1 ns / 1 ps
+
+module krnl_rtl_int #( // kernel datapath: AXI4 read master -> FIFO -> hls4ml IP -> FIFO -> AXI4 write master, with an AXI4-Lite control slave
+ parameter integer C_S_AXI_CONTROL_DATA_WIDTH = 32,
+ parameter integer C_S_AXI_CONTROL_ADDR_WIDTH = 6,
+ parameter integer C_M_AXI_GMEM_ID_WIDTH = 1,
+ parameter integer C_M_AXI_GMEM_ADDR_WIDTH = 64,
+ parameter integer C_M_AXI_GMEM_DATA_WIDTH = 32
+)
+(
+ // System signals
+ input wire ap_clk,
+ input wire ap_rst_n, // active-low reset; registered and inverted into areset below
+ // AXI4 master interface
+ output wire m_axi_gmem_AWVALID,
+ input wire m_axi_gmem_AWREADY,
+ output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_gmem_AWADDR,
+ output wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_AWID,
+ output wire [7:0] m_axi_gmem_AWLEN,
+ output wire [2:0] m_axi_gmem_AWSIZE,
+ // Tie-off AXI4 transaction options that are not being used.
+ output wire [1:0] m_axi_gmem_AWBURST,
+ output wire [1:0] m_axi_gmem_AWLOCK,
+ output wire [3:0] m_axi_gmem_AWCACHE,
+ output wire [2:0] m_axi_gmem_AWPROT,
+ output wire [3:0] m_axi_gmem_AWQOS,
+ output wire [3:0] m_axi_gmem_AWREGION,
+ output wire m_axi_gmem_WVALID,
+ input wire m_axi_gmem_WREADY,
+ output wire [C_M_AXI_GMEM_DATA_WIDTH-1:0] m_axi_gmem_WDATA,
+ output wire [C_M_AXI_GMEM_DATA_WIDTH/8-1:0] m_axi_gmem_WSTRB,
+ output wire m_axi_gmem_WLAST,
+ output wire m_axi_gmem_ARVALID,
+ input wire m_axi_gmem_ARREADY,
+ output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_gmem_ARADDR,
+ output wire [C_M_AXI_GMEM_ID_WIDTH-1:0] m_axi_gmem_ARID,
+ output wire [7:0] m_axi_gmem_ARLEN,
+ output wire [2:0] m_axi_gmem_ARSIZE,
+ output wire [1:0] m_axi_gmem_ARBURST,
+ output wire [1:0] m_axi_gmem_ARLOCK,
+ output wire [3:0] m_axi_gmem_ARCACHE,
+ output wire [2:0] m_axi_gmem_ARPROT,
+ output wire [3:0] m_axi_gmem_ARQOS,
+ output wire [3:0] m_axi_gmem_ARREGION,
+ input wire m_axi_gmem_RVALID,
+ output wire m_axi_gmem_RREADY,
+ input wire [C_M_AXI_GMEM_DATA_WIDTH - 1:0] m_axi_gmem_RDATA,
+ input wire m_axi_gmem_RLAST,
+ input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_RID,
+ input wire [1:0] m_axi_gmem_RRESP,
+ input wire m_axi_gmem_BVALID,
+ output wire m_axi_gmem_BREADY,
+ input wire [1:0] m_axi_gmem_BRESP,
+ input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_BID, // not connected to any sub-module in this file
+
+ // AXI4-Lite slave interface
+ input wire s_axi_control_AWVALID,
+ output wire s_axi_control_AWREADY,
+ input wire [C_S_AXI_CONTROL_ADDR_WIDTH-1:0] s_axi_control_AWADDR,
+ input wire s_axi_control_WVALID,
+ output wire s_axi_control_WREADY,
+ input wire [C_S_AXI_CONTROL_DATA_WIDTH-1:0] s_axi_control_WDATA,
+ input wire [C_S_AXI_CONTROL_DATA_WIDTH/8-1:0] s_axi_control_WSTRB,
+ input wire s_axi_control_ARVALID,
+ output wire s_axi_control_ARREADY,
+ input wire [C_S_AXI_CONTROL_ADDR_WIDTH-1:0] s_axi_control_ARADDR,
+ output wire s_axi_control_RVALID,
+ input wire s_axi_control_RREADY,
+ output wire [C_S_AXI_CONTROL_DATA_WIDTH-1:0] s_axi_control_RDATA,
+ output wire [1:0] s_axi_control_RRESP,
+ output wire s_axi_control_BVALID,
+ input wire s_axi_control_BREADY,
+ output wire [1:0] s_axi_control_BRESP,
+ output wire interrupt
+);
+///////////////////////////////////////////////////////////////////////////////
+// Local Parameters (constants)
+///////////////////////////////////////////////////////////////////////////////
+localparam integer LP_NUM_READ_CHANNELS = 1;
+localparam integer LP_LENGTH_WIDTH = 32;
+localparam integer LP_DW_BYTES = C_M_AXI_GMEM_DATA_WIDTH/8; // bytes per data beat
+localparam integer LP_AXI_BURST_LEN = 4096/LP_DW_BYTES < 256 ? 4096/LP_DW_BYTES : 256; // beats per burst: 4096 bytes' worth, capped at 256
+localparam integer LP_LOG_BURST_LEN = $clog2(LP_AXI_BURST_LEN);
+localparam integer LP_RD_MAX_OUTSTANDING = 3;
+localparam integer LP_RD_FIFO_DEPTH = LP_AXI_BURST_LEN*(LP_RD_MAX_OUTSTANDING + 1); // room for LP_RD_MAX_OUTSTANDING + 1 bursts
+localparam integer LP_WR_FIFO_DEPTH = LP_AXI_BURST_LEN; // room for one burst
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Variables
+///////////////////////////////////////////////////////////////////////////////
+logic areset = 1'b0; // registered, active-high copy of ~ap_rst_n
+logic ap_start;
+logic ap_start_pulse; // high for the cycle in which ap_start is 1 and its registered copy is still 0
+logic ap_start_r; // ap_start delayed by one clock
+logic ap_ready;
+logic ap_done;
+logic ap_idle = 1'b1;
+logic [C_M_AXI_GMEM_ADDR_WIDTH-1:0] fifo_in; // from the control slave; used as the read master's ctrl_offset
+logic [C_M_AXI_GMEM_ADDR_WIDTH-1:0] fifo_out; // from the control slave; used as the write master's ctrl_offset
+logic [LP_LENGTH_WIDTH-1:0] length_r_in; // from the control slave; used as the read master's ctrl_length
+logic [LP_LENGTH_WIDTH-1:0] length_r_out; // from the control slave; used as the write master's ctrl_length
+
+logic read_done; // driven by the read master's ctrl_done; not used elsewhere in this module
+logic [LP_NUM_READ_CHANNELS-1:0] rd_tvalid;
+logic [LP_NUM_READ_CHANNELS-1:0] rd_tready_n; // read FIFO "full"; inverted to form the read master's m_tready
+logic [LP_NUM_READ_CHANNELS-1:0] [C_M_AXI_GMEM_DATA_WIDTH-1:0] rd_tdata;
+logic [LP_NUM_READ_CHANNELS-1:0] rd_tlast;
+logic [LP_NUM_READ_CHANNELS-1:0] ctrl_rd_fifo_prog_full; // read FIFO "prog_full", fed back to the read master
+logic [LP_NUM_READ_CHANNELS-1:0] rd_fifo_tvalid_n; // read FIFO "empty"; inverted to form the NN IP's in_r_TVALID
+logic [LP_NUM_READ_CHANNELS-1:0] rd_fifo_tready;
+logic [LP_NUM_READ_CHANNELS-1:0] [C_M_AXI_GMEM_DATA_WIDTH-1:0] rd_fifo_tdata;
+logic [LP_NUM_READ_CHANNELS-1:0] rd_fifo_tlast;
+
+logic NN_inf_tvalid;
+logic NN_inf_tready_n; // write FIFO "full"; inverted to form the NN IP's out_r_TREADY
+logic [C_M_AXI_GMEM_DATA_WIDTH-1:0] NN_inf_tdata;
+logic wr_fifo_tvalid_n; // write FIFO "empty"; inverted to form the write master's s_tvalid
+logic wr_fifo_tready;
+logic [C_M_AXI_GMEM_DATA_WIDTH-1:0] wr_fifo_tdata;
+
+///////////////////////////////////////////////////////////////////////////////
+// RTL Logic
+///////////////////////////////////////////////////////////////////////////////
+// Tie-off unused AXI protocol features
+assign m_axi_gmem_AWID = {C_M_AXI_GMEM_ID_WIDTH{1'b0}};
+assign m_axi_gmem_AWBURST = 2'b01;
+assign m_axi_gmem_AWLOCK = 2'b00;
+assign m_axi_gmem_AWCACHE = 4'b0011;
+assign m_axi_gmem_AWPROT = 3'b000;
+assign m_axi_gmem_AWQOS = 4'b0000;
+assign m_axi_gmem_AWREGION = 4'b0000;
+assign m_axi_gmem_ARBURST = 2'b01;
+assign m_axi_gmem_ARLOCK = 2'b00;
+assign m_axi_gmem_ARCACHE = 4'b0011;
+assign m_axi_gmem_ARPROT = 3'b000;
+assign m_axi_gmem_ARQOS = 4'b0000;
+assign m_axi_gmem_ARREGION = 4'b0000;
+
+// Register and invert reset signal for better timing.
+always @(posedge ap_clk) begin
+ areset <= ~ap_rst_n;
+end
+
+// create pulse when ap_start transitions to 1
+always @(posedge ap_clk) begin
+ begin
+ ap_start_r <= ap_start; // no reset term: simply tracks ap_start one cycle late
+ end
+end
+
+assign ap_start_pulse = ap_start & ~ap_start_r;
+
+// ap_idle is asserted when done is asserted, it is de-asserted when ap_start_pulse
+// is asserted
+always @(posedge ap_clk) begin
+ if (areset) begin
+ ap_idle <= 1'b1;
+ end
+ else begin
+ ap_idle <= ap_done ? 1'b1 : // ap_done wins if it coincides with a start pulse
+ ap_start_pulse ? 1'b0 :
+ ap_idle;
+ end
+end
+
+assign ap_ready = ap_done; // ap_ready mirrors ap_done, which is the write master's ctrl_done
+
+// AXI4-Lite slave
+krnl_rtl_control_s_axi #(
+ .C_S_AXI_ADDR_WIDTH( C_S_AXI_CONTROL_ADDR_WIDTH ),
+ .C_S_AXI_DATA_WIDTH( C_S_AXI_CONTROL_DATA_WIDTH )
+)
+inst_krnl_control_s_axi (
+ .AWVALID ( s_axi_control_AWVALID ) ,
+ .AWREADY ( s_axi_control_AWREADY ) ,
+ .AWADDR ( s_axi_control_AWADDR ) ,
+ .WVALID ( s_axi_control_WVALID ) ,
+ .WREADY ( s_axi_control_WREADY ) ,
+ .WDATA ( s_axi_control_WDATA ) ,
+ .WSTRB ( s_axi_control_WSTRB ) ,
+ .ARVALID ( s_axi_control_ARVALID ) ,
+ .ARREADY ( s_axi_control_ARREADY ) ,
+ .ARADDR ( s_axi_control_ARADDR ) ,
+ .RVALID ( s_axi_control_RVALID ) ,
+ .RREADY ( s_axi_control_RREADY ) ,
+ .RDATA ( s_axi_control_RDATA ) ,
+ .RRESP ( s_axi_control_RRESP ) ,
+ .BVALID ( s_axi_control_BVALID ) ,
+ .BREADY ( s_axi_control_BREADY ) ,
+ .BRESP ( s_axi_control_BRESP ) ,
+ .ACLK ( ap_clk ) ,
+ .ARESET ( areset ) ,
+ .ACLK_EN ( 1'b1 ) ,
+ .ap_start ( ap_start ) ,
+ .interrupt ( interrupt ) ,
+ .ap_ready ( ap_ready ) ,
+ .ap_done ( ap_done ) ,
+ .ap_idle ( ap_idle ) ,
+ .fifo_in ( fifo_in[0+:C_M_AXI_GMEM_ADDR_WIDTH] ) ,
+ .fifo_out ( fifo_out[0+:C_M_AXI_GMEM_ADDR_WIDTH] ) ,
+ .length_r_in ( length_r_in[0+:LP_LENGTH_WIDTH] ) ,
+ .length_r_out ( length_r_out[0+:LP_LENGTH_WIDTH] )
+);
+
+// AXI4 Read Master
+krnl_rtl_axi_read_master #(
+ .C_ADDR_WIDTH ( C_M_AXI_GMEM_ADDR_WIDTH ) ,
+ .C_DATA_WIDTH ( C_M_AXI_GMEM_DATA_WIDTH ) ,
+ .C_ID_WIDTH ( C_M_AXI_GMEM_ID_WIDTH ) ,
+ .C_NUM_CHANNELS ( LP_NUM_READ_CHANNELS ) ,
+ .C_LENGTH_WIDTH ( LP_LENGTH_WIDTH ) ,
+ .C_BURST_LEN ( LP_AXI_BURST_LEN ) ,
+ .C_LOG_BURST_LEN ( LP_LOG_BURST_LEN ) ,
+ .C_MAX_OUTSTANDING ( LP_RD_MAX_OUTSTANDING )
+)
+inst_axi_read_master (
+ .aclk ( ap_clk ) ,
+ .areset ( areset ) ,
+
+ .ctrl_start ( ap_start_pulse ) ,
+ .ctrl_done ( read_done ) ,
+ .ctrl_offset ( fifo_in ) ,
+ .ctrl_length ( length_r_in ) ,
+ .ctrl_prog_full ( ctrl_rd_fifo_prog_full ) ,
+
+ .arvalid ( m_axi_gmem_ARVALID ) ,
+ .arready ( m_axi_gmem_ARREADY ) ,
+ .araddr ( m_axi_gmem_ARADDR ) ,
+ .arid ( m_axi_gmem_ARID ) ,
+ .arlen ( m_axi_gmem_ARLEN ) ,
+ .arsize ( m_axi_gmem_ARSIZE ) ,
+ .rvalid ( m_axi_gmem_RVALID ) ,
+ .rready ( m_axi_gmem_RREADY ) ,
+ .rdata ( m_axi_gmem_RDATA ) ,
+ .rlast ( m_axi_gmem_RLAST ) ,
+ .rid ( m_axi_gmem_RID ) ,
+ .rresp ( m_axi_gmem_RRESP ) ,
+
+ .m_tvalid ( rd_tvalid ) ,
+ .m_tready ( ~rd_tready_n ) ,
+ .m_tdata ( rd_tdata ) ,
+ .m_tlast ( rd_tlast )
+);
+
+// xpm_fifo_sync: Synchronous FIFO between the read master and the NN IP; each word is {tlast, tdata}
+// Xilinx Parameterized Macro, Version 2016.4
+xpm_fifo_sync # (
+ .FIFO_MEMORY_TYPE ("auto"), //string; "auto", "block", "distributed", or "ultra";
+ .ECC_MODE ("no_ecc"), //string; "no_ecc" or "en_ecc";
+ .FIFO_WRITE_DEPTH (LP_RD_FIFO_DEPTH), //positive integer
+ .WRITE_DATA_WIDTH (C_M_AXI_GMEM_DATA_WIDTH+1), //positive integer
+ .WR_DATA_COUNT_WIDTH ($clog2(LP_RD_FIFO_DEPTH)+1), //positive integer, Not used
+ .PROG_FULL_THRESH (LP_AXI_BURST_LEN-2), //positive integer
+ .FULL_RESET_VALUE (1), //positive integer; 0 or 1
+ .READ_MODE ("fwft"), //string; "std" or "fwft";
+ .FIFO_READ_LATENCY (1), //positive integer;
+ .READ_DATA_WIDTH (C_M_AXI_GMEM_DATA_WIDTH+1), //positive integer
+ .RD_DATA_COUNT_WIDTH ($clog2(LP_RD_FIFO_DEPTH)+1), //positive integer, not used
+ .PROG_EMPTY_THRESH (10), //positive integer, not used
+ .DOUT_RESET_VALUE ("0"), //string, don't care
+ .WAKEUP_TIME (0) //positive integer; 0 or 2;
+
+) inst_rd_xpm_fifo_sync[LP_NUM_READ_CHANNELS-1:0] (
+ .sleep ( 1'b0 ) ,
+ .rst ( areset ) ,
+ .wr_clk ( ap_clk ) ,
+ .wr_en ( rd_tvalid ) ,
+ .din ( {rd_tlast,rd_tdata} ) ,
+ .full ( rd_tready_n ) ,
+ .prog_full ( ctrl_rd_fifo_prog_full) ,
+ .wr_data_count ( ) ,
+ .overflow ( ) ,
+ .wr_rst_busy ( ) ,
+ .rd_en ( rd_fifo_tready ) ,
+ .dout ( {rd_fifo_tlast,rd_fifo_tdata} ) ,
+ .empty ( rd_fifo_tvalid_n ) ,
+ .prog_empty ( ) ,
+ .rd_data_count ( ) ,
+ .underflow ( ) ,
+ .rd_rst_busy ( ) ,
+ .injectsbiterr ( 1'b0 ) ,
+ .injectdbiterr ( 1'b0 ) ,
+ .sbiterr ( ) ,
+ .dbiterr ( )
+
+);
+
+// NN inference: streams from the read FIFO into the write FIFO. NOTE(review): reset here is the raw ap_rst_n, not the registered areset used by the other blocks - confirm intended
+myproject_axi_0 #()
+hls4ml_IP (
+ .ap_clk ( ap_clk ) ,
+ .ap_rst_n ( ap_rst_n ) ,
+
+ .in_r_TVALID ( ~rd_fifo_tvalid_n ) ,
+ .in_r_TREADY ( rd_fifo_tready ) ,
+ .in_r_TDATA ( rd_fifo_tdata ) ,
+ .in_r_TLAST ( rd_fifo_tlast ) ,
+
+ .out_r_TVALID ( NN_inf_tvalid ) ,
+ .out_r_TREADY ( ~NN_inf_tready_n ) ,
+ .out_r_TDATA ( NN_inf_tdata )
+);
+
+// xpm_fifo_sync: Synchronous FIFO between the NN IP and the write master (data only, no tlast bit)
+// Xilinx Parameterized Macro, Version 2016.4
+xpm_fifo_sync # (
+ .FIFO_MEMORY_TYPE ("auto"), //string; "auto", "block", "distributed", or "ultra";
+ .ECC_MODE ("no_ecc"), //string; "no_ecc" or "en_ecc";
+ .FIFO_WRITE_DEPTH (LP_WR_FIFO_DEPTH), //positive integer
+ .WRITE_DATA_WIDTH (C_M_AXI_GMEM_DATA_WIDTH), //positive integer
+ .WR_DATA_COUNT_WIDTH ($clog2(LP_WR_FIFO_DEPTH)), //positive integer, Not used
+ .PROG_FULL_THRESH (10), //positive integer, Not used
+ .FULL_RESET_VALUE (1), //positive integer; 0 or 1
+ .READ_MODE ("fwft"), //string; "std" or "fwft";
+ .FIFO_READ_LATENCY (1), //positive integer;
+ .READ_DATA_WIDTH (C_M_AXI_GMEM_DATA_WIDTH), //positive integer
+ .RD_DATA_COUNT_WIDTH ($clog2(LP_WR_FIFO_DEPTH)), //positive integer, not used
+ .PROG_EMPTY_THRESH (10), //positive integer, not used
+ .DOUT_RESET_VALUE ("0"), //string, don't care
+ .WAKEUP_TIME (0) //positive integer; 0 or 2;
+
+) inst_wr_xpm_fifo_sync (
+ .sleep ( 1'b0 ) ,
+ .rst ( areset ) ,
+ .wr_clk ( ap_clk ) ,
+ .wr_en ( NN_inf_tvalid ) ,
+ .din ( NN_inf_tdata ) ,
+ .full ( NN_inf_tready_n ) ,
+ .prog_full ( ) ,
+ .wr_data_count ( ) ,
+ .overflow ( ) ,
+ .wr_rst_busy ( ) ,
+ .rd_en ( wr_fifo_tready ) ,
+ .dout ( wr_fifo_tdata ) ,
+ .empty ( wr_fifo_tvalid_n ) ,
+ .prog_empty ( ) ,
+ .rd_data_count ( ) ,
+ .underflow ( ) ,
+ .rd_rst_busy ( ) ,
+ .injectsbiterr ( 1'b0 ) ,
+ .injectdbiterr ( 1'b0 ) ,
+ .sbiterr ( ) ,
+ .dbiterr ( )
+
+);
+
+
+// AXI4 Write Master: its ctrl_done is the kernel's ap_done
+krnl_rtl_axi_write_master #(
+ .C_ADDR_WIDTH ( C_M_AXI_GMEM_ADDR_WIDTH ) ,
+ .C_DATA_WIDTH ( C_M_AXI_GMEM_DATA_WIDTH ) ,
+ .C_MAX_LENGTH_WIDTH ( LP_LENGTH_WIDTH ) ,
+ .C_BURST_LEN ( LP_AXI_BURST_LEN ) ,
+ .C_LOG_BURST_LEN ( LP_LOG_BURST_LEN )
+)
+inst_axi_write_master (
+ .aclk ( ap_clk ) ,
+ .areset ( areset ) ,
+
+ .ctrl_start ( ap_start_pulse ) ,
+ .ctrl_offset ( fifo_out ) ,
+ .ctrl_length ( length_r_out ) ,
+ .ctrl_done ( ap_done ) ,
+
+ .awvalid ( m_axi_gmem_AWVALID ) ,
+ .awready ( m_axi_gmem_AWREADY ) ,
+ .awaddr ( m_axi_gmem_AWADDR ) ,
+ .awlen ( m_axi_gmem_AWLEN ) ,
+ .awsize ( m_axi_gmem_AWSIZE ) ,
+
+ .s_tvalid ( ~wr_fifo_tvalid_n ) ,
+ .s_tready ( wr_fifo_tready ) ,
+ .s_tdata ( wr_fifo_tdata ) ,
+
+ .wvalid ( m_axi_gmem_WVALID ) ,
+ .wready ( m_axi_gmem_WREADY ) ,
+ .wdata ( m_axi_gmem_WDATA ) ,
+ .wstrb ( m_axi_gmem_WSTRB ) ,
+ .wlast ( m_axi_gmem_WLAST ) ,
+
+ .bvalid ( m_axi_gmem_BVALID ) ,
+ .bready ( m_axi_gmem_BREADY ) ,
+ .bresp ( m_axi_gmem_BRESP )
+);
+
+endmodule : krnl_rtl_int
+
+`default_nettype wire
diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/myproject_kernel.v b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/myproject_kernel.v
new file mode 100644
index 000000000..0d5dc71d6
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/myproject_kernel.v
@@ -0,0 +1,170 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+// Description: This is a wrapper of module "krnl_rtl_int"
+///////////////////////////////////////////////////////////////////////////////
+
+// default_nettype of none prevents implicit wire declaration.
+`default_nettype none
+`timescale 1 ns / 1 ps
+
+module krnl_rtl #( // top-level kernel wrapper: every parameter and port is forwarded unchanged to krnl_rtl_int
+ parameter integer C_S_AXI_CONTROL_DATA_WIDTH = 32,
+ parameter integer C_S_AXI_CONTROL_ADDR_WIDTH = 6,
+ parameter integer C_M_AXI_GMEM_ID_WIDTH = 1,
+ parameter integer C_M_AXI_GMEM_ADDR_WIDTH = 64,
+ parameter integer C_M_AXI_GMEM_DATA_WIDTH = 32
+)
+(
+ // System signals
+ input wire ap_clk,
+ input wire ap_rst_n,
+ // AXI4 master interface
+ output wire m_axi_gmem_AWVALID,
+ input wire m_axi_gmem_AWREADY,
+ output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_gmem_AWADDR,
+ output wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_AWID,
+ output wire [7:0] m_axi_gmem_AWLEN,
+ output wire [2:0] m_axi_gmem_AWSIZE,
+ // Tie-off AXI4 transaction options that are not being used.
+ output wire [1:0] m_axi_gmem_AWBURST,
+ output wire [1:0] m_axi_gmem_AWLOCK,
+ output wire [3:0] m_axi_gmem_AWCACHE,
+ output wire [2:0] m_axi_gmem_AWPROT,
+ output wire [3:0] m_axi_gmem_AWQOS,
+ output wire [3:0] m_axi_gmem_AWREGION,
+ output wire m_axi_gmem_WVALID,
+ input wire m_axi_gmem_WREADY,
+ output wire [C_M_AXI_GMEM_DATA_WIDTH-1:0] m_axi_gmem_WDATA,
+ output wire [C_M_AXI_GMEM_DATA_WIDTH/8-1:0] m_axi_gmem_WSTRB,
+ output wire m_axi_gmem_WLAST,
+ output wire m_axi_gmem_ARVALID,
+ input wire m_axi_gmem_ARREADY,
+ output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_gmem_ARADDR,
+ output wire [C_M_AXI_GMEM_ID_WIDTH-1:0] m_axi_gmem_ARID,
+ output wire [7:0] m_axi_gmem_ARLEN,
+ output wire [2:0] m_axi_gmem_ARSIZE,
+ output wire [1:0] m_axi_gmem_ARBURST,
+ output wire [1:0] m_axi_gmem_ARLOCK,
+ output wire [3:0] m_axi_gmem_ARCACHE,
+ output wire [2:0] m_axi_gmem_ARPROT,
+ output wire [3:0] m_axi_gmem_ARQOS,
+ output wire [3:0] m_axi_gmem_ARREGION,
+ input wire m_axi_gmem_RVALID,
+ output wire m_axi_gmem_RREADY,
+ input wire [C_M_AXI_GMEM_DATA_WIDTH - 1:0] m_axi_gmem_RDATA,
+ input wire m_axi_gmem_RLAST,
+ input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_RID,
+ input wire [1:0] m_axi_gmem_RRESP,
+ input wire m_axi_gmem_BVALID,
+ output wire m_axi_gmem_BREADY,
+ input wire [1:0] m_axi_gmem_BRESP,
+ input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_gmem_BID,
+
+ // AXI4-Lite slave interface
+ input wire s_axi_control_AWVALID,
+ output wire s_axi_control_AWREADY,
+ input wire [C_S_AXI_CONTROL_ADDR_WIDTH-1:0] s_axi_control_AWADDR,
+ input wire s_axi_control_WVALID,
+ output wire s_axi_control_WREADY,
+ input wire [C_S_AXI_CONTROL_DATA_WIDTH-1:0] s_axi_control_WDATA,
+ input wire [C_S_AXI_CONTROL_DATA_WIDTH/8-1:0] s_axi_control_WSTRB,
+ input wire s_axi_control_ARVALID,
+ output wire s_axi_control_ARREADY,
+ input wire [C_S_AXI_CONTROL_ADDR_WIDTH-1:0] s_axi_control_ARADDR,
+ output wire s_axi_control_RVALID,
+ input wire s_axi_control_RREADY,
+ output wire [C_S_AXI_CONTROL_DATA_WIDTH-1:0] s_axi_control_RDATA,
+ output wire [1:0] s_axi_control_RRESP,
+ output wire s_axi_control_BVALID,
+ input wire s_axi_control_BREADY,
+ output wire [1:0] s_axi_control_BRESP,
+ output wire interrupt
+);
+
+krnl_rtl_int #( // the implementation; parameters are passed through one-to-one
+ .C_S_AXI_CONTROL_DATA_WIDTH ( C_S_AXI_CONTROL_DATA_WIDTH ),
+ .C_S_AXI_CONTROL_ADDR_WIDTH ( C_S_AXI_CONTROL_ADDR_WIDTH ),
+ .C_M_AXI_GMEM_ID_WIDTH ( C_M_AXI_GMEM_ID_WIDTH ),
+ .C_M_AXI_GMEM_ADDR_WIDTH ( C_M_AXI_GMEM_ADDR_WIDTH ),
+ .C_M_AXI_GMEM_DATA_WIDTH ( C_M_AXI_GMEM_DATA_WIDTH )
+)
+inst_krnl_rtl_int ( // each port is wired to the wrapper port of the same name
+ .ap_clk ( ap_clk ),
+ .ap_rst_n ( ap_rst_n ),
+ .m_axi_gmem_AWVALID ( m_axi_gmem_AWVALID ),
+ .m_axi_gmem_AWREADY ( m_axi_gmem_AWREADY ),
+ .m_axi_gmem_AWADDR ( m_axi_gmem_AWADDR ),
+ .m_axi_gmem_AWID ( m_axi_gmem_AWID ),
+ .m_axi_gmem_AWLEN ( m_axi_gmem_AWLEN ),
+ .m_axi_gmem_AWSIZE ( m_axi_gmem_AWSIZE ),
+ .m_axi_gmem_AWBURST ( m_axi_gmem_AWBURST ),
+ .m_axi_gmem_AWLOCK ( m_axi_gmem_AWLOCK ),
+ .m_axi_gmem_AWCACHE ( m_axi_gmem_AWCACHE ),
+ .m_axi_gmem_AWPROT ( m_axi_gmem_AWPROT ),
+ .m_axi_gmem_AWQOS ( m_axi_gmem_AWQOS ),
+ .m_axi_gmem_AWREGION ( m_axi_gmem_AWREGION ),
+ .m_axi_gmem_WVALID ( m_axi_gmem_WVALID ),
+ .m_axi_gmem_WREADY ( m_axi_gmem_WREADY ),
+ .m_axi_gmem_WDATA ( m_axi_gmem_WDATA ),
+ .m_axi_gmem_WSTRB ( m_axi_gmem_WSTRB ),
+ .m_axi_gmem_WLAST ( m_axi_gmem_WLAST ),
+ .m_axi_gmem_ARVALID ( m_axi_gmem_ARVALID ),
+ .m_axi_gmem_ARREADY ( m_axi_gmem_ARREADY ),
+ .m_axi_gmem_ARADDR ( m_axi_gmem_ARADDR ),
+ .m_axi_gmem_ARID ( m_axi_gmem_ARID ),
+ .m_axi_gmem_ARLEN ( m_axi_gmem_ARLEN ),
+ .m_axi_gmem_ARSIZE ( m_axi_gmem_ARSIZE ),
+ .m_axi_gmem_ARBURST ( m_axi_gmem_ARBURST ),
+ .m_axi_gmem_ARLOCK ( m_axi_gmem_ARLOCK ),
+ .m_axi_gmem_ARCACHE ( m_axi_gmem_ARCACHE ),
+ .m_axi_gmem_ARPROT ( m_axi_gmem_ARPROT ),
+ .m_axi_gmem_ARQOS ( m_axi_gmem_ARQOS ),
+ .m_axi_gmem_ARREGION ( m_axi_gmem_ARREGION ),
+ .m_axi_gmem_RVALID ( m_axi_gmem_RVALID ),
+ .m_axi_gmem_RREADY ( m_axi_gmem_RREADY ),
+ .m_axi_gmem_RDATA ( m_axi_gmem_RDATA ),
+ .m_axi_gmem_RLAST ( m_axi_gmem_RLAST ),
+ .m_axi_gmem_RID ( m_axi_gmem_RID ),
+ .m_axi_gmem_RRESP ( m_axi_gmem_RRESP ),
+ .m_axi_gmem_BVALID ( m_axi_gmem_BVALID ),
+ .m_axi_gmem_BREADY ( m_axi_gmem_BREADY ),
+ .m_axi_gmem_BRESP ( m_axi_gmem_BRESP ),
+ .m_axi_gmem_BID ( m_axi_gmem_BID ),
+ .s_axi_control_AWVALID ( s_axi_control_AWVALID ),
+ .s_axi_control_AWREADY ( s_axi_control_AWREADY ),
+ .s_axi_control_AWADDR ( s_axi_control_AWADDR ),
+ .s_axi_control_WVALID ( s_axi_control_WVALID ),
+ .s_axi_control_WREADY ( s_axi_control_WREADY ),
+ .s_axi_control_WDATA ( s_axi_control_WDATA ),
+ .s_axi_control_WSTRB ( s_axi_control_WSTRB ),
+ .s_axi_control_ARVALID ( s_axi_control_ARVALID ),
+ .s_axi_control_ARREADY ( s_axi_control_ARREADY ),
+ .s_axi_control_ARADDR ( s_axi_control_ARADDR ),
+ .s_axi_control_RVALID ( s_axi_control_RVALID ),
+ .s_axi_control_RREADY ( s_axi_control_RREADY ),
+ .s_axi_control_RDATA ( s_axi_control_RDATA ),
+ .s_axi_control_RRESP ( s_axi_control_RRESP ),
+ .s_axi_control_BVALID ( s_axi_control_BVALID ),
+ .s_axi_control_BREADY ( s_axi_control_BREADY ),
+ .s_axi_control_BRESP ( s_axi_control_BRESP ),
+ .interrupt ( interrupt )
+);
+endmodule : krnl_rtl
+
+`default_nettype wire
+
diff --git a/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py
new file mode 100644
index 000000000..2c220df3f
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py
@@ -0,0 +1,108 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay
+from pynq import allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+ def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None):
+
+ super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+ self.input_buffer=None
+ self.output_buffer=None
+
+ def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None):
+ """
+ Buffer allocation in the card memory
+ Parameters
+ ----------
+ X_shape : input buffer shape.
+ y_shape : output buffer shape.
+ dtype : the data type of the elements of the input/output vectors.
+ Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+ types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+ Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot
+ any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+ doc for more info).
+ In this case the encoding/decoding has to be computed by the host machine. For example for
+ 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
+ 'float' -> 'ap_fixed<16,6>':
+ ```
+ def encode(xi):
+ return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+ def decode(yi):
+ return yi * 2**-10
+ encode_v = np.vectorize(encode) # to apply them element-wise
+ decode_v = np.vectorize(decode)
+ ```
+ trg_in : input buffer target memory. By default the v++ command
+ set it to HBM[0] for alveo-u50.
+ trg_out : output buffer target memory.By default the v++ command
+ set it to HBM[0] for alveo-u50.
+
+ Assigns
+ -------
+ input_buffer : input PYNQ buffer, must be allocated first and just once.
+ output_buffer : output PYNQ buffer, must be allocated first and just once.
+ input_buffer, output_buffer : input and output PYNQ buffers
+
+ """
+ self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in )
+ self.output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out)
+
+ def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None,
+ decode=None):
+ """
+ Obtain the predictions of the NN implemented in the FPGA.
+ Parameters:
+ - X : the input vector. Should be numpy ndarray.
+ - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and
+ for sizing the output vector shape.
+ - dtype : the data type of the elements of the input/output vectors.
+ - debug : boolean, if set the function will print information about the data transfers status.
+ - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+ - encode/decode: function pointers. See `dtype` section for more information.
+ - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+ the namesake parameter.
+ """
+ self.allocate_mem(X_shape=X.shape, y_shape=y_shape, dtype=dtype)
+ if profile:
+ timea = datetime.now()
+ if encode is not None:
+ X = encode(X)
+ in_size = np.prod(X.shape)
+ out_size = np.prod(y_shape)
+ self.input_buffer[:] = X
+ self.input_buffer.sync_to_device()
+ if debug:
+ print("Send OK")
+ self.krnl_rtl_1.call(self.input_buffer, self.output_buffer, in_size, out_size)
+ if debug:
+ print("Kernel call OK")
+ self.output_buffer.sync_from_device()
+ if debug:
+ print("Recieve OK")
+ result = self.output_buffer.copy()
+ if profile:
+ timeb = datetime.now()
+ dts, rate = self._print_dt(timea, timeb, len(X))
+ self.input_buffer.flush()
+ self.output_buffer.flush()
+ self.free()
+ return result, dts, rate
+ self.input_buffer.flush()
+ self.output_buffer.flush()
+ return result
+
+ def free_overlay(self):
+ self.free()
+
+ def _print_dt(self, timea, timeb, N):
+ dt = (timeb - timea)
+ dts = dt.seconds + dt.microseconds * 10 ** -6
+ rate = N / dts
+ print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+ print("Or {} us / inferences".format(1 / rate * 1e6))
+ return dts, rate
+
diff --git a/hls4ml/templates/vivado_accelerator/alveo/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/alveo/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 000000000..2970100e2
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/alveo/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,109 @@
+# Source project.tcl from this script's directory; it is expected to set the `myproject` and `part` variables used below.
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Create the Vivado project for the selected part and register the myproject_prj directory as an IP repository.
+create_project project_1 ${myproject}_vivado_accelerator -part ${part} -force
+set_property ip_repo_paths ${myproject}_prj [current_project]
+update_ip_catalog
+
+# Add the RTL kernel sources from src/ to the project, then import them.
+add_files -scan_for_includes {src/krnl_rtl_int.sv src/krnl_rtl_axi_read_master.sv src/krnl_rtl_counter.sv src/myproject_kernel.v src/krnl_rtl_axi_write_master.sv src/krnl_rtl_control_s_axi.v}
+import_files {src/krnl_rtl_int.sv src/krnl_rtl_axi_read_master.sv src/krnl_rtl_counter.sv src/myproject_kernel.v src/krnl_rtl_axi_write_master.sv src/krnl_rtl_control_s_axi.v}
+
+# Instantiate the xilinx.com:hls:<myproject>_axi:1.0 IP (presumably the HLS export found in the repository above) as myproject_axi_0.
+create_ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 -module_name myproject_axi_0
+
+
+# Package the project as an IP core rooted at hls4ml_IP, then reopen that core in the tmp_edit_project edit project.
+ipx::package_project -root_dir hls4ml_IP -vendor fastmachinelearning.org -library hls4ml -taxonomy /UserIP -import_files -set_current false
+ipx::unload_core hls4ml_IP/component.xml
+ipx::edit_ip_in_project -upgrade true -name tmp_edit_project -directory hls4ml_IP hls4ml_IP/component.xml
+
+# Associate both AXI interfaces (m_axi_gmem, s_axi_control) with ap_clk and add a FREQ_HZ parameter to the ap_clk interface.
+ipx::associate_bus_interfaces -busif m_axi_gmem -clock ap_clk [ipx::current_core]
+ipx::associate_bus_interfaces -busif s_axi_control -clock ap_clk [ipx::current_core]
+ipx::add_bus_parameter FREQ_HZ [ipx::get_bus_interfaces ap_clk -of_objects [ipx::current_core]]
+# Set value_resolve_type to "user" for FREQ_HZ on every bus interface whose name matches *clk*.
+set_property value_resolve_type user [ipx::get_bus_parameters -of [::ipx::get_bus_interfaces -of [ipx::current_core] *clk*] "FREQ_HZ"]
+
+# Declare the registers of address block reg0 in the s_axi_control memory map.
+ipx::add_register CTRL [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register GIER [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register IP_IER [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register IP_ISR [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register fifo_in [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register fifo_out [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register length_r_in [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+ipx::add_register length_r_out [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]
+
+
+# Commands to set the description, address offset and size of each register (size 64 for the two pointer arguments, 32 otherwise)
+
+# CTRL register properties
+set_property Description "Control Signals" [ipx::get_registers CTRL -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x000 [ipx::get_registers CTRL -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers CTRL -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# GIER register properties
+set_property Description "Global Interrupt Enable Register" [ipx::get_registers GIER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x004 [ipx::get_registers GIER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers GIER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# IP_IER register properties
+set_property Description "IP Interrupt Enable Register" [ipx::get_registers IP_IER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x008 [ipx::get_registers IP_IER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers IP_IER -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# IP_ISR register properties
+set_property Description "IP Interrupt Status Register" [ipx::get_registers IP_ISR -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x00C [ipx::get_registers IP_ISR -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers IP_ISR -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# fifo_in register properties
+set_property Description "fifo_in pointer argument" [ipx::get_registers fifo_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x010 [ipx::get_registers fifo_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 64 [ipx::get_registers fifo_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# fifo_out register properties
+set_property Description "fifo_out pointer argument" [ipx::get_registers fifo_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x01C [ipx::get_registers fifo_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 64 [ipx::get_registers fifo_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# length_r_in register properties
+set_property Description "length_r_in value" [ipx::get_registers length_r_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x028 [ipx::get_registers length_r_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers length_r_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# length_r_out register properties
+set_property Description "length_r_out value" [ipx::get_registers length_r_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Address_Offset 0x030 [ipx::get_registers length_r_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+set_property Size 32 [ipx::get_registers length_r_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+
+# Add an ASSOCIATED_BUSIF parameter to the fifo_in and fifo_out pointer registers
+ipx::add_register_parameter ASSOCIATED_BUSIF [ipx::get_registers fifo_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+ipx::add_register_parameter ASSOCIATED_BUSIF [ipx::get_registers fifo_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]
+# Set m_axi_gmem as the value of both ASSOCIATED_BUSIF register parameters
+set_property Value m_axi_gmem [ipx::get_register_parameters ASSOCIATED_BUSIF -of_objects [ipx::get_registers fifo_in -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]]
+set_property Value m_axi_gmem [ipx::get_register_parameters ASSOCIATED_BUSIF -of_objects [ipx::get_registers fifo_out -of_objects [ipx::get_address_blocks reg0 -of_objects [ipx::get_memory_maps s_axi_control -of_objects [ipx::current_core]]]]]
+
+# Flag the core as an RTL kernel (sdx_kernel, sdx_kernel_type) and set its xpm_libraries property.
+set core [ipx::current_core]
+set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} $core
+set_property sdx_kernel true $core
+set_property sdx_kernel_type rtl $core
+
+# Set the core revision, refresh the archive, GUI files and checksums, then save, integrity-check and archive the core.
+set_property core_revision 2 [ipx::current_core]
+ipx::update_source_project_archive -component [ipx::current_core]
+ipx::create_xgui_files [ipx::current_core]
+ipx::update_checksums [ipx::current_core]
+ipx::save_core [ipx::current_core]
+ipx::check_integrity -quiet [ipx::current_core]
+ipx::archive_core hls4ml_IP/fastmachinelearning.org_hls4ml_krnl_rtl_1.0.zip [ipx::current_core]
+
+# Switch from the temporary IP edit project back to project_1.
+current_project project_1
+
+# Package the hls4ml_IP directory into xo_files/<myproject>_kernel.xo under the kernel name krnl_rtl.
+package_xo -force -xo_path xo_files/${myproject}_kernel.xo -kernel_name krnl_rtl -ip_directory hls4ml_IP
diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py
index 8d726c3e2..ae9d201e8 100644
--- a/hls4ml/writer/vivado_accelerator_writer.py
+++ b/hls4ml/writer/vivado_accelerator_writer.py
@@ -1,6 +1,6 @@
import os
-from shutil import copyfile
-
+from shutil import copyfile, copytree
+from distutils.dir_util import copy_tree
from hls4ml.writer.vivado_writer import VivadoWriter
class VivadoAcceleratorWriter(VivadoWriter):
@@ -311,14 +311,22 @@ def write_wrapper_test(self, model):
def write_board_script(self, model):
'''
- Write the tcl scripts to create a Vivado IPI project for the VivadoAccelerator
+ Write the tcl scripts and kernel sources to create a Vivado IPI project for the VivadoAccelerator
'''
filedir = os.path.dirname(os.path.abspath(__file__))
copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()),
'{}/design.tcl'.format(model.config.get_output_dir()))
+ # Generic alveo board
+ if self.vivado_accelerator_config.get_board().startswith('alveo'):
+ src_dir=os.path.join(filedir, self.vivado_accelerator_config.get_krnl_rtl_src_dir())
+ dst_dir= os.path.abspath(model.config.get_output_dir())+'/src'
+ copy_tree(src_dir,dst_dir)
f = open('{}/project.tcl'.format(model.config.get_output_dir()), 'w')
f.write('variable myproject\n')
f.write('set myproject "{}"\n'.format(model.config.get_project_name()))
+ if self.vivado_accelerator_config.get_board().startswith('alveo'):
+ f.write('variable part\n')
+ f.write('set part "{}"\n'.format(self.vivado_accelerator_config.get_part()))
if self.vivado_accelerator_config.get_interface() == 'axi_stream':
in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth()
f.write('set bit_width_hls_output {}\n'.format(in_bit))
diff --git a/test/hls4ml-keras-test.sh b/test/hls4ml-keras-test.sh
index 09ce49053..62c1bd20d 100755
--- a/test/hls4ml-keras-test.sh
+++ b/test/hls4ml-keras-test.sh
@@ -11,6 +11,9 @@ VIVADO_VERSION=2020.1
# Alternatively, keras-to-hls script can be called, with the model name(s) specified, i.e.:
#./keras-to-hls.sh KERAS_1layer KERAS_conv1d_small
+./keras-to-hls.sh -b alveo-u250 -B VivadoAccelerator -x xcu250-figd2104-2L-e KERAS_3layer
+./keras-to-hls.sh -b pynq-z2 -B VivadoAccelerator -x xc7z020clg400-1 KERAS_3layer
+# KERAS_3layer b:pynq-z2 B:VivadoAccelerator x:xc7z020clg400-1 s:Resource
# Build the projects generated by keras-to-hls script.
# Remove parameter -s to disable synthesis. -p controls the number of parallel tasks
diff --git a/test/keras-models.txt b/test/keras-models.txt
index 439c770a0..e087cb6f6 100644
--- a/test/keras-models.txt
+++ b/test/keras-models.txt
@@ -29,10 +29,11 @@ KERAS_3layer_batch_norm
KERAS_3layer_binary_smaller
KERAS_3layer_ternary_small
+# Pynq backend
KERAS_3layer b:pynq-z2 B:VivadoAccelerator x:xc7z020clg400-1 s:Resource
-
garnet_1layer x:xcku115-flvb2104-2-i y:garnet_1layer_config
+
# Resource strategy
KERAS_3layer r:2 s:Resource
qkeras_mnist_dense r:112 s:Resource
diff --git a/test/keras-to-hls.sh b/test/keras-to-hls.sh
index b11304c61..674c9c3f3 100755
--- a/test/keras-to-hls.sh
+++ b/test/keras-to-hls.sh
@@ -11,7 +11,7 @@ strategy="Latency"
type="ap_fixed<16,6>"
yml=""
basedir=vivado_prj
-
+precision="float"
sanitizer="[^A-Za-z0-9._]"
function print_usage {
@@ -47,9 +47,9 @@ function print_usage {
echo " Prints this help message."
}
-while getopts ":x:b:B:c:sr:g:t:d:y:h" opt; do
+while getopts ":x:b:B:c:sr:g:t:d:y:p:h" opt; do
case "$opt" in
- x) part=$OPTARG
+ x) part=$OPTARG
;;
b) board=$OPTARG
;;
@@ -69,6 +69,8 @@ while getopts ":x:b:B:c:sr:g:t:d:y:h" opt; do
;;
y) yml=$OPTARG
;;
+ p) precision=$OPTARG
+ ;;
h)
print_usage
exit
@@ -109,7 +111,6 @@ do
if [ ! -z "${yml}" ]; then
hlscfg=`sed -ne '/HLSConfig/,$p' ../example-models/config-files/${yml}`
fi
-
echo "KerasJson: ../example-models/keras/${name}.json" > ${file}
echo "KerasH5: ../example-models/keras/${h5}.h5" >> ${file}
echo "OutputDir: ${prjdir}" >> ${file}
@@ -120,7 +121,6 @@ do
echo "ClockPeriod: ${clock}" >> ${file}
echo "" >> ${file}
echo "IOType: ${io}" >> ${file}
-
if [ -z "${hlscfg}" ]
then
echo "HLSConfig:" >> ${file}
@@ -131,7 +131,16 @@ do
else
echo "${hlscfg}" >> ${file}
fi
-
+ # Adding VivadoAccelerator config to file
+ if [ "${backend}" = "VivadoAccelerator" ];
+ then
+ echo "AcceleratorConfig:" >> ${file}
+ echo " Board: ${board}" >> ${file}
+ echo " Precision:" >> ${file}
+ echo " Input: ${precision}" >> ${file}
+ echo " Output: ${precision}" >> ${file}
+ fi
+
${pycmd} ../scripts/hls4ml convert -c ${file} || exit 1
rm ${file}
rm -rf "${prjdir}"