diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd44d466..5fe692052 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,8 +5,10 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig +from hls4ml.backends.vitis.vitis_backend import VitisBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) +register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) diff --git a/hls4ml/backends/vitis/__init__.py b/hls4ml/backends/vitis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/vitis/passes/__init__.py b/hls4ml/backends/vitis/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py new file mode 100644 index 000000000..eddd5530f --- /dev/null +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -0,0 +1,28 @@ +from hls4ml.model.optimizer import OptimizerPass + + +class ValidateConvImplementation(OptimizerPass): + + def match(self, node): + return 'Conv' in node.class_name + + def transform(self, model, node): + if node.get_attr('implementation', 'linebuffer') == 'encoded': + print(f'WARNING: "Encoded" implementation in "{node.name}" ({node.class_name}) is not supported in Vitis backend. 
Switching to "LineBuffer" implementation.') + node.set_attr('implementation', 'linebuffer') + + +class ValidateStrategy(OptimizerPass): + _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] + + def match(self, node): + is_resource_layer = len([layer_cls for layer_cls in self._resource_layer_cls if layer_cls in node.class_name]) > 0 + is_resource_strategy = node.model.config.is_resource_strategy(node) + + return is_resource_layer and is_resource_strategy + + def transform(self, model, node): + n_in, _ = model.config.backend.get_layer_mult_size(node) + rf = node.get_attr('reuse_factor') + if rf > n_in and rf % n_in > 0: + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. Consider using a different ReuseFactor or switching to "Latency" strategy.') diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py new file mode 100644 index 000000000..8fc4ab9c3 --- /dev/null +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -0,0 +1,46 @@ +import os +import sys + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow, get_flow +from hls4ml.report import parse_vivado_report + + +class VitisBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='Vitis') + self._register_layer_attributes() + self._register_flows() + + def _register_flows(self): + validation_passes = [ + 'vitis:validate_conv_implementation', + 'vitis:validate_strategy', + ] + validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + + # Any potential templates registered specifically for Vitis backend + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=['vivado:init_layers'], backend=self.name) + + writer_passes = ['make_stamp', 'vitis:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], 
backend=self.name) + + ip_flow_requirements = get_flow('vivado:ip').requires.copy() + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + if 'linux' in sys.platform: + found = os.system('command -v vitis_hls > /dev/null') + if found != 0: + raise Exception('Vitis HLS installation not found. Make sure "vitis_hls" is on PATH.') + + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + os.system('vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation} export={export} vsynth={vsynth}"' + .format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)) + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) \ No newline at end of file diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 195fc00b5..005a0718a 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -9,6 +9,8 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; @@ -123,6 +125,7 @@ def format(self, node): static const unsigned out_width = {out_width}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; + 
static const unsigned multiplier_limit = DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; @@ -363,6 +366,9 @@ def format(self, node): # Depthwise config params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index a73350a29..c8119c0c2 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; @@ -63,6 +64,7 @@ def format(self, node): static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); static const bool store_weights_in_bram = false; typedef {bias_t.name} bias_t; typedef {scale_t.name} scale_t; diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 863512c4c..7aa705750 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -49,6 +49,7 @@ def format(self, node): static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); typedef {accum_t.name} accum_t; template using product = nnet::product::{product_type}; diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 74ec61e82..d7c826e74 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 504df597e..68c3ad9dd 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -56,15 +56,21 @@ def _find_solutions(sln_dir): solutions = [] if os.path.isfile(sln_dir + '/vivado_hls.app'): - with open(sln_dir + '/vivado_hls.app') as f: - # Get rid of namespaces (workaround to support two types of vivado_hls.app files) - xmlstring = re.sub(' xmlns="[^"]+"', '', 
f.read(), count=1) + sln_file = 'vivado_hls.app' + elif os.path.isfile(sln_dir + '/hls.app'): + sln_file = 'hls.app' + else: + return solutions + + with open(sln_dir + '/' + sln_file) as f: + # Get rid of namespaces (workaround to support two types of vivado_hls.app files) + xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) - root = ET.fromstring(xmlstring) - for sln_tag in root.findall('solutions/solution'): - sln_name = sln_tag.get('name') - if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): - solutions.append(sln_name) + root = ET.fromstring(xmlstring) + for sln_tag in root.findall('solutions/solution'): + sln_name = sln_tag.get('name') + if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): + solutions.append(sln_name) return solutions @@ -172,8 +178,13 @@ def parse_vivado_report(hls_dir): # Area area_node = root.find('./AreaEstimates') for child in area_node.find('./Resources'): + # DSPs are called 'DSP48E' in Vivado and just 'DSP' in Vitis. 
Overriding here to have consistent keys + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report[child.tag] = child.text for child in area_node.find('./AvailableResources'): + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report['Available' + child.tag] = child.text report['CSynthesisReport'] = c_synth_report else: diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 000000000..6477bbd90 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,102 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. 
+ typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..f054adc3d --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,36 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +template +void conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + +} + + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 000000000..ea0afc7d2 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,104 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * 
CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = multiplier_limit / mult_n_out; + + assert((multiplier_limit % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind 
+ + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 000000000..1c77f4f3e --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,81 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// Line Buffer +template +void conv_2d_buffer_latency_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < 
CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_buffer_resource_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_buffer_latency_cl(data, res, weights, biases); + } else { + conv_2d_buffer_resource_cl(data, res, weights, biases); + } +} + +} +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h new file mode 100644 index 000000000..d96b75b47 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,247 @@ +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; 
+ + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + // Increment in_index + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::n_in); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in && CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. 
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = CONFIG_T::reuse_factor / CONFIG_T::n_in; + + int outidx[CONFIG_T::reuse_factor]; + IndexLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % CONFIG_T::n_in == 0) { + outstep++; + } + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + out_index = outidx[ir]/*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + out_index += outscale; + } + + in_index++; + if (in_index >= CONFIG_T::n_in) { + in_index = 0; + //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
+ } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = CONFIG_T::n_out; + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int in_index = w_index % CONFIG_T::n_in; // As of Vitis HLS 2022.1, this still results in urem core being used. 
+ tmpmult[im] = CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int out_index = w_index / CONFIG_T::n_in; + if (out_index >= multiplier_limit) continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h new file mode 100644 index 000000000..955dc9e78 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,113 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_latency_wrapper( + data_T 
data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + dense_resource(data, res, weights, biases); +} + +template +void data_prepare( + hls::stream &data_stream, + typename data_T::value_type data[CONFIG_T::n_in] +) { + #pragma HLS INLINE + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: for(int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + #pragma HLS PIPELINE + data_T data_pack = data_stream.read(); + DataPackPipeline: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write( + typename res_T::value_type res[CONFIG_T::n_out], + hls::stream &res_stream +) { + #pragma HLS INLINE + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: for(unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + #pragma HLS PIPELINE + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } + } else { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackSingle: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + 
+template +void dense( + hls::stream &data_stream, + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) +{ + #pragma HLS INLINE recursive + + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + + data_prepare(data_stream, data); + if (CONFIG_T::strategy == nnet::latency) { + dense_latency_wrapper(data, res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, biases); + } + res_write(res, res_stream); +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h new file mode 100644 index 000000000..ac921e0d3 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -0,0 +1,340 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet{ + +// Return the maximum value from an array +template +T max(T x[N]){ + T y = x[0]; + for(int i = 1; i < N; i++){ + y = x[i] > y ? 
x[i] : y; + } + return y; +} + +template +ap_int avg(ap_int (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_int tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_int y = tmp; + return tmp; +} + +template +ap_fixed avg(ap_fixed (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_fixed tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template +T avg(T (&x)[N]){ + T y = 0; + for(int i = 0; i < N; i++){ + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template +T pool_op(T (&x)[N]){ + switch(op){ + case Max: return max(x); + case Average: return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template +T pad_val(){ + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch(op){ + case Max:{ + T x = 0; + x[x.width - 1] = 1; + return x; + break;} + case Average: return 0; + } +} + +struct pooling1d_config{ + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for(int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for(int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if(ii+jj < CONFIG_T::pad_left || ii+jj >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[jj] = pad_val(); + }else{ + pool[jj] = data[(ii + jj) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to 
area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average) { + data_T rescale = CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + for(int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config{ + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template +constexpr int pool_op_limit(){ + return 
DIV_ROUNDUP((CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt, CONFIG_T::reuse_factor); +} + +template +void pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width * CONFIG_T::n_filt + (jj + ll) * 
CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // 
Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width + ff * CONFIG_T::in_width*CONFIG_T::in_height + ll + jj]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] *= rescale; + } + } + } + } +} + + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + FiltLoop: + for(int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * 
CONFIG_T::in_width]; + + InputLoop: + for (int i = 0 ; i < CONFIG_T::in_height * CONFIG_T::in_width ; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 000000000..f936c7c88 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "utils/x_hls_utils.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_pooling.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template +T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template +void compute_pool_buffer_2d( + const data_T& in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt], + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + 
nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* +template +void compute_pool_buffer_1d( + const data_T& in_elem, + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ( (sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + #pragma HLS inline recursive + + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template +T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool( + const data_T& in_elem, + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt] +) { + PoolFilt: for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool(data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + 
res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } + +} + +template +void global_pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInput: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } + +} + +} + +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 000000000..6850497ff --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,89 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv1d_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } +} + +template +void pointwise_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } else { + ReadInputWidthSerial: for (unsigned i_iw 
= 0; i_iw < CONFIG_T::in_width; i_iw++) { + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + + +template +void separable_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 000000000..352828ecd --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,113 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv2d_stream.h" + +namespace nnet { + +// Line Buffer Implementation (Phil's) +template +void 
depthwise_conv_2d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } +} + + +template +void pointwise_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + 
ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } +} + +template +void separable_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_buffer_cl(data, 
depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 3b0f9ad53..d34337c57 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -162,9 +162,11 @@ if {$opt(reset)} { open_solution "solution1" } catch {config_array_partition -maximum_size 4096} -config_compile -name_max_length 60 +config_compile -name_max_length 80 set_part $part +config_schedule -enable_dsp_full_reg=false create_clock -period $clock_period -name default +set_clock_uncertainty $clock_uncertainty default if {$opt(csim)} { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index cf97ce099..dcfcb00d4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -40,7 +40,7 @@ template void linear(hls::stream< data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LinearPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -62,7 +62,7 @@ template void relu(hls::stream void sigmoid(hls::stream data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -173,11 +173,12 @@ void softmax_latency(hls::stream &data, hls::stream &res) { invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } 
res.write(out_pack); @@ -253,11 +254,12 @@ void softmax_stable(hls::stream &data, hls::stream &res) { invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); @@ -322,7 +324,8 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -410,7 +413,7 @@ template void tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -495,7 +498,7 @@ void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hl data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -521,7 +524,7 @@ void thresholded_relu(hls::stream &data, typename data_T::value_type the data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -560,7 +563,7 @@ template void softplus(hls::strea data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -601,7 +604,7 @@ template void softsign(hls::strea data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) 
SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -642,7 +645,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) EluPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -690,7 +693,7 @@ template void selu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -747,7 +750,7 @@ void binary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -772,7 +775,7 @@ void ternary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index 200282784..2314f5609 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -70,18 +70,17 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Calcuate result Result: for (int ires = 0; ires < CONFIG_T::n_in; ires++) { if (CONFIG_T::n_filt==-1) { res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + bias[ires]; - } else { + } else { int norm_index = ires%CONFIG_T::n_filt; res[ires] = 
CONFIG_T::template product::product(data[ires], scale[norm_index]) + bias[norm_index]; } - } + } } // **************************************************** @@ -108,13 +107,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T data_T datareg; ap_uint<1> cache; for (int ii=0; ii threshold[norm_index] ) cache = 1; else cache = 0; res[ii] = (ap_uint<1>) cache; - } } @@ -134,7 +132,6 @@ void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T else cache = 0; res[ii] = (ap_int<2>) cache; - } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index ce76c01bc..ce49d65b0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -41,16 +41,15 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; - CONFIG_T::template product::limit(multiplier_limit); + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormpack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -83,7 +82,7 @@ void normalize_binary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -109,7 +108,7 @@ void normalize_ternary_tanh( data_T in_data = data.read(); nnet::array, 
CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 9bfae8339..af59f9021 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -27,6 +27,17 @@ #define MIN(n,d) (n > d ? d : n) #define MAX(n,d) (n > d ? n : d) +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable= +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + namespace nnet { // Common type definitions diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 65f71d080..dd7225346 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -29,8 +29,8 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=weights complete #pragma HLS ARRAY_PARTITION variable=biases complete - int multiplier_limit = CONFIG_T::n_pixels * (ceil(float(mult_n_in * mult_n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::mult_config::n_zeros) / float(CONFIG_T::reuse_factor))); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index 40568921e..b23c330c7 100644 --- 
a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -36,7 +36,7 @@ void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; @@ -74,7 +74,7 @@ template void conv_1d_cl(hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: conv_1d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index ff2fb181c..43222696c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -7,166 +7,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template - int compute_multiplier_limit_conv2d( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || 
(oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - //padded - do nothing - continue; - } else { - if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { - n_mult++; - } - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - -template -void conv_2d_latency_cf( - data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height*CONFIG_T::out_width*CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ - - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - - // Convolve, saving all multiplication results to accumulate later - ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - ConvOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; 
ow++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - ConvFiltHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - ConvFiltWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - int index_data = cc*CONFIG_T::in_height*CONFIG_T::in_width - + (oh*CONFIG_T::stride_height+fh-CONFIG_T::pad_top)*CONFIG_T::in_width - + (ow*CONFIG_T::stride_width+fw-CONFIG_T::pad_left); - mult[index_mult] = data[index_data] * weights[index_weight]; - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - - // Initialize accumulator with input biases - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - acc[oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff]=biases[ff]; - } - } - } - - - // Accumulate multiplication result - AccumOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - AccumOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - AccumFilt: for(int ff 
= 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - AccumDotHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - AccumDotWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - int index_acc = oh*CONFIG_T::out_width*CONFIG_T::n_filt - + ow*CONFIG_T::n_filt - + ff; - - acc[index_acc] += mult[index_mult]; - - }//end dot product filter width loop - }//end dot product filter height loop - }//end n channel loop - }//end n filter loop - }//end output width loop - }//end output height loop - - // Cast to "res_t" type - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - int res_index = ff*CONFIG_T::out_height*CONFIG_T::out_width + oh*CONFIG_T::out_width + ow; - int acc_index = oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff; - res[res_index] = acc[acc_index]; - } - } - } - -}//end conv2d - template void conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], @@ -189,8 +29,8 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=weights complete #pragma HLS ARRAY_PARTITION variable=biases complete - int multiplier_limit = CONFIG_T::n_pixels * (ceil(float(mult_n_in * mult_n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::mult_config::n_zeros) / float(CONFIG_T::reuse_factor))); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation 
instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index ccc048d4f..8a4fb6be8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -42,7 +42,7 @@ void conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; @@ -97,7 +97,7 @@ void conv_2d_cl( hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: conv_2d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index ef46a5210..203810f28 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -94,7 +94,7 @@ void mult_buffer(hls::stream data_window[CONFIG_T:: data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( data, res, weights, biases); @@ -280,7 +280,7 @@ void compute_output_buffer_2d( #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable = res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -289,7 +289,7 @@ void compute_output_buffer_2d( if 
((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( kernel_data, res_out, weights, biases); @@ -351,7 +351,7 @@ void compute_output_buffer_1d( #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable = res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::kernel_shift_1d(in_elem, kernel_data); @@ -360,7 +360,7 @@ void compute_output_buffer_1d( if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( kernel_data, res_out, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h index dc803ff2b..7202b3a10 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h @@ -51,10 +51,12 @@ void dense_compressed( #pragma HLS ARRAY_PARTITION variable=acc complete #pragma HLS ARRAY_PARTITION variable=biases complete #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - //if (CONFIG_T::store_weights_in_bram){ - //#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else #pragma HLS data_pack variable=weights struct_level - //} +#endif InitAccum: for(unsigned i = 0; i < CONFIG_T::n_out; i++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index c4dcea4ab..464e8b495 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -52,8 +52,7 @@ void dense_latency( #pragma HLS ARRAY_PARTITION variable=mult complete 
#pragma HLS ARRAY_PARTITION variable=acc complete - int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index c0e5d1759..180365327 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -270,7 +270,7 @@ void dense_resource( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { dense_resource_rf_leq_nin(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 52c96c52c..564bafac9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,7 +16,7 @@ void dense_wrapper( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor dense_latency(data, res, weights, biases); @@ -56,7 +56,7 @@ void dense( #pragma HLS PIPELINE } res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = res[i_out * res_T::size + i_pack]; diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h index 3ada00b24..fb8e2fb43 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h @@ -19,7 +19,7 @@ void embedding( #pragma HLS PIPELINE II=CONFIG_T::reuse_factor res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) DenseEmbedding: for (int i = 0; i < CONFIG_T::n_out; i++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h index 42d2ce80e..89f91d6f0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h @@ -44,7 +44,7 @@ void resize_nearest( #pragma HLS UNROLL data_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ResizeChan: for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index a35c264d2..19f2b421d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -140,8 +140,7 @@ void dot1d( { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=mult complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index 6a1d81a4a..1ebbb9662 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -37,7 +37,7 @@ void 
add(hls::stream &data1, hls::stream &data2, hls::stream input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AddPack: for (int j = 0; j < res_T::size; j++) { @@ -60,7 +60,7 @@ void subtract(hls::stream &data1, hls::stream &data2, hls::s input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SubtractPack: for (int j = 0; j < res_T::size; j++) { @@ -83,7 +83,7 @@ void multiply(hls::stream &data1, hls::stream &data2, hls::s input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MultiplyPack: for (int j = 0; j < res_T::size; j++) { @@ -106,7 +106,7 @@ void average(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AveragePack: for (int j = 0; j < res_T::size; j++) { @@ -129,7 +129,7 @@ void maximum(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MaximumPack: for (int j = 0; j < res_T::size; j++) { @@ -152,7 +152,7 @@ void minimum(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MinimumPack: for (int j = 0; j < res_T::size; j++) { @@ -174,7 +174,7 @@ void concatenate3d_0(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -193,7 +193,7 @@ void 
concatenate3d_0(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -216,7 +216,7 @@ void concatenate3d_1(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -232,7 +232,7 @@ void concatenate3d_1(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -256,7 +256,7 @@ void concatenate3d_2(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -294,7 +294,7 @@ void concatenate2d_0(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -310,7 +310,7 @@ void concatenate2d_0(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -331,7 +331,7 @@ void concatenate2d_1(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -361,7 +361,7 @@ void concatenate2d(hls::stream &data1, hls::stream &data2, h template void 
concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { res_T out_data; -#pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatLoop1: for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { #pragma HLS PIPELINE diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index 586bc65ae..966959c70 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -16,10 +16,7 @@ namespace product{ * types of each. * --- */ -class Product{ - public: - static void limit(unsigned multiplier_limit) {} // Nothing to do here -}; +class Product{}; template class both_binary : public Product{ @@ -77,10 +74,6 @@ class mult : public Product{ #pragma HLS INLINE return a * w; } - static void limit(unsigned multiplier_limit){ - #pragma HLS INLINE - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - } }; template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 303315b52..cd7d1a135 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -109,7 +109,7 @@ void pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { @@ -159,7 +159,7 @@ void global_pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit for(int ff = 0; ff < 
CONFIG_T::n_filt; ff++) { data_T pool[CONFIG_T::n_in]; @@ -211,7 +211,7 @@ void pooling2d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; @@ -266,7 +266,7 @@ void pooling2d_cf( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 837cd416a..af06624fa 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -130,7 +130,7 @@ void pooling2d_encoded_cl( assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -176,7 +176,7 @@ void compute_pool_buffer_2d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -245,7 +245,7 @@ void 
pooling2d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling2d_buffer_cl(data, res); @@ -344,7 +344,7 @@ void pooling1d_encoded_cl( assert(CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -385,7 +385,7 @@ void compute_pool_buffer_1d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels // 1D case line buffer not necessary. Put directly into the kernel_data buffer @@ -441,7 +441,7 @@ void pooling1d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling1d_buffer_cl(data, res); @@ -523,7 +523,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -535,7 +535,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); @@ -577,7 +577,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -589,7 +589,7 
@@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index e94286aa8..a2581a94c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -293,7 +293,7 @@ template nnet::lstm(reset_state,data_in,h_newstate, s_newstate, param,param_r,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -305,7 +305,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -565,7 +565,7 @@ template nnet::gru(reset_state,data_in,h_newstate,param,param_zr,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -577,7 +577,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index b0f6ce9c6..71ccf1a01 
100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -26,7 +26,7 @@ void depthwise_conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index 69e272652..b2c80950a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -27,7 +27,7 @@ void depthwise_conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index e8826e300..ce6528995 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -26,8 +26,7 @@ void depthwise_product( #pragma HLS ARRAY_PARTITION variable=mult complete - int multiplier_limit = ceil(float(CONFIG_T::kernel_size * CONFIG_T::n_chan) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product: for(int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { @@ -77,7 +76,7 @@ void depthwise_mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) 
{ depthwise_product(data, res, weights, biases); } else { @@ -157,7 +156,7 @@ void pointwise_mult_buffer( data[id] = data_pack[id]; } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -204,7 +203,7 @@ void compute_depthwise_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { @@ -268,7 +267,7 @@ void compute_depthwise_output_buffer_2d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h index 9ee6628fe..b4de14ffd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h @@ -3,17 +3,18 @@ #define NNET_STREAM_H #include "hls_stream.h" +#include "nnet_common.h" namespace nnet { struct broadcast_config { - static const unsigned in_height = 1; - static const unsigned in_width = 1; - static const unsigned in_chan = 3; - static const unsigned out_height = 2; - static const unsigned out_width = 2; - static const unsigned out_chan = 3; + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; }; template @@ -24,8 +25,8 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre data_T in_data = data.read(); res_T out_data1; 
res_T out_data2; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -47,9 +48,9 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre res_T out_data1; res_T out_data2; res_T out_data3; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 - #pragma HLS DATA_PACK variable=out_data3 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -72,7 +73,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +91,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < pack_diff; j++) { #pragma HLS PIPELINE @@ -136,7 +137,7 @@ void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) for (int j = 0; j < n_dupl; j++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL out_data[k] = in_data[k]; @@ -152,20 +153,20 @@ void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) BroadcastLoop: for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - res_T out_data; - #pragma HLS DATA_PACK variable=out_data - for (int k = 0; k < res_T::size; k++) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL - out_data[k] = in_data[0]; - } - res.write(out_data); 
+ out_data[k] = in_data[0]; + } + res.write(out_data); } } template void broadcast_stream(hls::stream &data, hls::stream &res) { if(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { - broadcast_stream_1x1xC(data, res); + broadcast_stream_1x1xC(data, res); } else if(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width) { broadcast_stream_HxWx1(data, res); @@ -180,19 +181,19 @@ void transpose_2d(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - for (int j = 0; j < data_T::size; j++) { - data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); } } for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < res_T::size; j++) { - out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); } - res.write(out_data); + res.write(out_data); } } } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b25576f9a..8ac4f1f8e 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -3,8 +3,10 @@ from hls4ml.writer.writers import Writer, register_writer, get_writer from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.quartus_writer import QuartusWriter register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) +register_writer('Vitis', VitisWriter) 
register_writer('Quartus', QuartusWriter) diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py new file mode 100644 index 000000000..44b7d97c0 --- /dev/null +++ b/hls4ml/writer/vitis_writer.py @@ -0,0 +1,31 @@ +import os +import glob +from shutil import copy +from hls4ml.writer.vivado_writer import VivadoWriter + +class VitisWriter(VivadoWriter): + + def __init__(self): + super().__init__() + + def write_nnet_utils_overrides(self, model): + ################### + ## nnet_utils + ################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir,'../templates/vitis/nnet_utils/') + dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copy(srcpath + h, dstpath + h) + + def write_hls(self, model): + """ + Write the HLS project. Calls the steps from VivadoWriter, adapted for Vitis + """ + super(VitisWriter, self).write_hls(model) + self.write_nnet_utils_overrides(model) diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index 0ce72d1af..46c193fdb 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -391,6 +391,8 @@ def write_board_script(self, model): f.write(f'set part "{self.vivado_accelerator_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write(f'set bit_width_hls_output {in_bit}\n') diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 9f67df4cb..a7d269102 100644 --- 
a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -583,6 +583,8 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.close() # build_prj.tcl @@ -642,7 +644,7 @@ def write_nnet_utils(self, model): # custom source filedir = os.path.dirname(os.path.abspath(__file__)) - custom_source = get_backend('Vivado').get_custom_source() + custom_source = model.config.backend.get_custom_source() for dst, srcpath in custom_source.items(): dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' copyfile(srcpath, dstpath) diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 7aea0884e..9875bfe14 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -9,7 +9,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index 1b17637d9..f50329230 100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py 
index d183d81c4..4e128b874 100644 --- a/test/pytest/test_causalpadding.py +++ b/test/pytest/test_causalpadding.py @@ -10,7 +10,7 @@ atol = 5e-3 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_causalpadding(io_type, backend): model = Sequential() diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 262ae5013..ab3365f22 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -58,6 +58,10 @@ def keras_model(mnist_data): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_cnn_mnist_qkeras.py b/test/pytest/test_cnn_mnist_qkeras.py index c34e0965a..cf3dbf17d 100644 --- a/test/pytest/test_cnn_mnist_qkeras.py +++ b/test/pytest/test_cnn_mnist_qkeras.py @@ -40,7 +40,12 @@ def mnist_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def hls_model(mnist_model, backend, io_type, strategy): keras_model = mnist_model @@ -66,7 +71,12 @@ def hls_model(mnist_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + 
('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def test_accuracy(mnist_data, mnist_model, hls_model): x_train, y_train, x_test, y_test = mnist_data diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 1d91d80ea..bc8a68002 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -30,7 +30,11 @@ def keras_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def hls_model(keras_model, backend, io_type, strategy): default_precision = 'ap_fixed<16,3,AP_RND_CONV,AP_SAT>' if backend=='Vivado' else 'ac_fixed<16,3,true,AC_RND_CONV,AC_SAT>' @@ -63,7 +67,11 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index 8073a7a1a..fd8e39cdb 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def 
hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 1c8e07198..9945768ea 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -126,11 +126,14 @@ def regsister_custom_layer(): hls4ml.model.layers.register_layer('HReverse', HReverse) -@pytest.mark.parametrize('backend_id', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend_id', ['Vivado', 'Vitis', 'Quartus']) def test_extensions(tmp_path, backend_id): # Register the optimization passes (if any) backend = hls4ml.backends.get_backend(backend_id) - backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize') + ip_flow = hls4ml.model.flow.get_flow(backend.get_default_flow()) + # Add the pass into the main optimization flow + optimize_flow = [flow for flow in ip_flow.requires if ':optimize' in flow][0] + backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=optimize_flow) # Register template passes for the given backend backend.register_template(HReverseConfigTemplate) @@ -168,6 +171,6 @@ def test_extensions(tmp_path, backend_id): hres = hmodel.predict(x.astype('float32')) # Check if the optimizer pass was applied - assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][f'{backend_id.lower()}:optimize'] + assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][optimize_flow] np.testing.assert_array_equal(kres, hres) diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index e1d3b1fec..ebbdb2419 100644 --- a/test/pytest/test_globalpooling.py +++ 
b/test/pytest/test_globalpooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -88,7 +88,7 @@ def keras_model_2d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index bd3f175b1..6da516646 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -15,7 +15,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_dense(backend, io_type): model = tf.keras.models.Sequential() @@ -66,7 +66,7 @@ def test_dense(backend, io_type): PReLU(alpha_initializer="zeros",), Activation(activation='sigmoid', name='Activation')]) #ThresholdedReLU(theta=1.0)]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activations(activation_function, backend, io_type): model = tf.keras.models.Sequential() @@ -94,7 +94,7 @@ def test_activations(activation_function, backend, io_type): padds_options = ['same', 'valid'] @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv1d(padds, backend, io_type): model = tf.keras.models.Sequential() @@ -123,8 +123,8 @@ def test_conv1d(padds, 
backend, io_type): # 5e-2 might be too high np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2) - if not (backend=='Vivado' and io_type=='io_stream' and padds=='same'): - # Vivado inserts and additional layer for 'same' padding in io_stream + if not (backend in ['Vivado', 'Vitis'] and io_type=='io_stream' and padds=='same'): + # Vivado/Vitis inserts and additional layer for 'same' padding in io_stream assert len(model.layers) + 2 == len(hls_model.get_layers()) assert list(hls_model.get_layers())[1].attributes['name'] == model.layers[0]._name assert list(hls_model.get_layers())[1].attributes['class_name'] == 'Conv1D' @@ -154,7 +154,7 @@ def test_conv1d(padds, backend, io_type): padds_options=['same', 'valid'] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv2d(chans, padds, backend, io_type): model = tf.keras.models.Sequential() @@ -235,7 +235,7 @@ def test_conv2d(chans, padds, backend, io_type): @pytest.mark.parametrize('pooling', pooling_layers) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('chans', chans_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_pooling(pooling, padds, chans, backend): assert '1D' in pooling.__name__ or '2D' in pooling.__name__ diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py index 0fa689e45..08753d584 100644 --- a/test/pytest/test_keras_h5_loader.py +++ b/test/pytest/test_keras_h5_loader.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def 
test_keras_h5_loader(backend): input_shape = (10,) model = tf.keras.models.Sequential([ diff --git a/test/pytest/test_merge.py b/test/pytest/test_merge.py index 470e9b3ff..8ab4fa3a1 100644 --- a/test/pytest/test_merge.py +++ b/test/pytest/test_merge.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize('merge_layer', [Add, Average, Maximum, Minimum, Multiply, Subtract]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_merge(merge_layer, io_type, backend): input_shape = (10, 10, 3) @@ -35,7 +35,7 @@ def test_merge(merge_layer, io_type, backend): @pytest.mark.parametrize('axes', [1]) @pytest.mark.parametrize('io_type', ['io_parallel']) # No io_stream implementation yet -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_dot(axes, io_type, backend): # Only 1D implemented input_shape = (10, ) @@ -61,7 +61,7 @@ def test_dot(axes, io_type, backend): np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate1d(io_type, backend): input_shape = (10,) @@ -87,7 +87,7 @@ def test_concatenate1d(io_type, backend): @pytest.mark.parametrize('axis', [1, 2]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate2d(axis, io_type, backend): input_shape = (10, 3) @@ -114,7 +114,7 @@ def test_concatenate2d(axis, io_type, backend): @pytest.mark.parametrize('axis', [1, 2, 3]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) 
-@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate3d(axis, io_type, backend): input_shape = (10, 10, 3) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 7650056f8..d43e35288 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -20,11 +20,13 @@ @pytest.mark.parametrize('backend, io_type, strategy', [ ('Quartus', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'resource'), - + ('Vitis', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), - + ('Vitis', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): model = tf.keras.models.Sequential() diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 399154144..8645ecd0b 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -123,7 +123,7 @@ def randX_100_16(): # https://github.com/fastmachinelearning/hls4ml/issues/381 # @pytest.mark.parametrize('bits', [4, 6, 8]) @pytest.mark.parametrize('bits,alpha', [(4, 1), (4, 'auto_po2')]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_type): ''' @@ -221,7 +221,7 @@ def test_quantizer_special(randX_1000_1, quantizer, backend, io_type): (7, 10, binary(), quantized_bits(5, 2), binary(), False, True), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 
'io_stream']) def test_btnn(make_btnn, randX_100_10, backend, io_type): model, is_xnor, test_no = make_btnn @@ -264,7 +264,7 @@ def randX_1000_1(): (quantized_relu(10, 5)), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_quantizer(randX_1000_1, quantizer, backend, io_type): ''' diff --git a/test/pytest/test_rnn.py b/test/pytest/test_rnn.py index 12fc42601..aa49e43d3 100644 --- a/test/pytest/test_rnn.py +++ b/test/pytest/test_rnn.py @@ -70,10 +70,14 @@ def test_rnn_parsing(rnn_layer, return_sequences): [ (SimpleRNN, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_parallel'), + (LSTM, 'Vitis', 'io_parallel'), (LSTM, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_stream'), + (LSTM, 'Vitis', 'io_stream'), (GRU, 'Vivado', 'io_parallel'), (GRU, 'Vivado', 'io_stream'), + (GRU, 'Vitis', 'io_parallel'), + (GRU, 'Vitis', 'io_stream'), (GRU, 'Quartus', 'io_parallel'), (GRU, 'Quartus', 'io_stream'), ], diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 7815d5770..d32569449 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -23,7 +23,8 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) model.add(conv2d(filters=32, @@ -42,8 +43,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') 
kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') - output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds)) - hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type) + output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type)) + hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend) hls_model.compile() hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 749a019f3..9290faf50 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -23,7 +23,7 @@ def high_accuracy_distribution(shape): def generate_data(function, input_shape): return function((1000, *input_shape)) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('strategy', ['stable', 'argmax']) @pytest.mark.parametrize('function,input_shape,io_type', [ (flat_distribution, (8,), 'io_parallel'), @@ -58,7 +58,7 @@ def test_softmax(backend, strategy, generate_data, input_shape, io_type, functio assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 2f70b8251..338aaf6f3 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -7,7 +7,7 @@ test_root_path = 
Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('input_shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_trace.py b/test/pytest/test_trace.py index ce01c4213..4c7cde4ac 100644 --- a/test/pytest/test_trace.py +++ b/test/pytest/test_trace.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_trace(backend): '''Test the tracing feature with a simple Keras model.''' model = tf.keras.models.Sequential() diff --git a/test/pytest/test_transpose_concat.py b/test/pytest/test_transpose_concat.py index 488fc46b6..db3e03125 100644 --- a/test/pytest/test_transpose_concat.py +++ b/test/pytest/test_transpose_concat.py @@ -29,7 +29,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name' @@ -45,7 +45,7 @@ def hls_model(keras_model, backend, io_type): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_accuracy(data, keras_model, hls_model): X = data model = keras_model diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 7e698fd90..0f5130162 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -41,7 +41,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) 
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 219f727c0..ca539a9ef 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -45,7 +45,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d':