From 27a4eb7b2ee2bbac340876f3a247e222dff09f01 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 20:44:42 +0200 Subject: [PATCH 01/20] Remove the limit function, Vitis doesn't like this --- .../backends/vivado/passes/convolution_templates.py | 7 +++++++ hls4ml/backends/vivado/passes/core_templates.py | 2 ++ hls4ml/backends/vivado/passes/merge_templates.py | 1 + hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h | 11 ++++------- .../vivado/nnet_utils/nnet_batchnorm_stream.h | 5 ++--- .../templates/vivado/nnet_utils/nnet_dense_latency.h | 3 +-- hls4ml/templates/vivado/nnet_utils/nnet_merge.h | 3 +-- hls4ml/templates/vivado/nnet_utils/nnet_mult.h | 9 +-------- .../templates/vivado/nnet_utils/nnet_sepconv_stream.h | 3 +-- 9 files changed, 20 insertions(+), 24 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index d4ac2d5b0a..a16ab80c6e 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -10,6 +10,8 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; @@ -103,6 +105,7 @@ def format(self, node): static const unsigned out_width = {out_width}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; @@ -278,6 +281,10 @@ def format(self, node): # Depthwise config params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index f63c0f454d..8327e3a7fe 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -13,6 +13,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; @@ -60,6 +61,7 @@ def format(self, node): static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); static const bool store_weights_in_bram = false; typedef {bias_t.name} bias_t; typedef {scale_t.name} scale_t; diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 863512c4c5..7aa705750d 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -49,6 +49,7 @@ def format(self, node): static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); typedef {accum_t.name} accum_t; template using product = nnet::product::{product_type}; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index 2002827843..2314f56091 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -70,18 +70,17 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Calcuate result Result: for (int ires = 0; ires < CONFIG_T::n_in; ires++) { if (CONFIG_T::n_filt==-1) { res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + bias[ires]; - } else { + } else { int norm_index = ires%CONFIG_T::n_filt; res[ires] = CONFIG_T::template product::product(data[ires], scale[norm_index]) + bias[norm_index]; } - } + } } // **************************************************** @@ -108,13 +107,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T data_T datareg; ap_uint<1> cache; for (int ii=0; ii threshold[norm_index] ) cache = 1; else cache = 0; res[ii] = (ap_uint<1>) cache; - } } @@ -134,7 +132,6 @@ void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T else cache = 0; res[ii] = (ap_int<2>) cache; - } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index ce76c01bc3..a2b406806c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -41,9 +41,8 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; - CONFIG_T::template product::limit(multiplier_limit); + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index c4dcea4abf..464e8b4959 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -52,8 +52,7 @@ void dense_latency( 
#pragma HLS ARRAY_PARTITION variable=mult complete #pragma HLS ARRAY_PARTITION variable=acc complete - int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index a35c264d28..19f2b421dd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -140,8 +140,7 @@ void dot1d( { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=mult complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index 586bc65aeb..966959c705 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -16,10 +16,7 @@ namespace product{ * types of each. * --- */ -class Product{ - public: - static void limit(unsigned multiplier_limit) {} // Nothing to do here -}; +class Product{}; template class both_binary : public Product{ @@ -77,10 +74,6 @@ class mult : public Product{ #pragma HLS INLINE return a * w; } - static void limit(unsigned multiplier_limit){ - #pragma HLS INLINE - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - } }; template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index e8826e300c..5788d429e1 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -26,8 +26,7 @@ void depthwise_product( #pragma HLS ARRAY_PARTITION variable=mult complete - int multiplier_limit = ceil(float(CONFIG_T::kernel_size * CONFIG_T::n_chan) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product: for(int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { From 9612365a3655ed2a5732c71fb3636b1cc696d072 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 20:45:15 +0200 Subject: [PATCH 02/20] Rudimentary Vitis backend --- hls4ml/backends/__init__.py | 2 ++ hls4ml/backends/vitis/vitis_backend.py | 32 ++++++++++++++++++++++++++ hls4ml/report/vivado_report.py | 24 +++++++++++-------- hls4ml/writer/__init__.py | 2 ++ hls4ml/writer/vitis_writer.py | 15 ++++++++++++ 5 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 hls4ml/backends/vitis/vitis_backend.py create mode 100644 hls4ml/writer/vitis_writer.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd44d4661..5fe6920525 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,8 +5,10 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from 
hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig +from hls4ml.backends.vitis.vitis_backend import VitisBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) +register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py new file mode 100644 index 0000000000..512917e933 --- /dev/null +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -0,0 +1,32 @@ +import os +import sys + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow +from hls4ml.report import parse_vivado_report + + +class VitisBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='Vitis') + self._register_flows() + + def _register_flows(self): + vivado_ip = 'vivado:ip' + writer_passes = ['make_stamp', 'vitis:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) + self._default_flow = vivado_ip + + def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + if 'linux' in sys.platform: + found = os.system('command -v vitis_hls > /dev/null') + if found != 0: + raise Exception('Vitis HLS installation not found. Make sure "vitis_hls" is on PATH.') + + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + os.system('vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation} export={export} vsynth={vsynth}"' + .format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)) + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) \ No newline at end of file diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 7930564b80..4325afd29d 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -53,15 +53,21 @@ def _find_solutions(sln_dir): solutions = [] if os.path.isfile(sln_dir + '/vivado_hls.app'): - with open(sln_dir + '/vivado_hls.app') as f: - # Get rid of namespaces (workaround to support two types of vivado_hls.app files) - xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) - - root = ET.fromstring(xmlstring) - for sln_tag in root.findall('solutions/solution'): - sln_name = sln_tag.get('name') - if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): - solutions.append(sln_name) + sln_file = 'vivado_hls.app' + elif os.path.isfile(sln_dir + '/hls.app'): + sln_file = 'hls.app' + else: + return solutions + + with open(sln_dir + '/' + sln_file) as f: + # Get rid of namespaces (workaround to support two types of vivado_hls.app files) + xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) + + root = ET.fromstring(xmlstring) + for sln_tag in root.findall('solutions/solution'): + sln_name = sln_tag.get('name') + if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): + solutions.append(sln_name) return solutions diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b25576f9a1..8ac4f1f8e9 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -3,8 +3,10 @@ from hls4ml.writer.writers import Writer, register_writer, 
get_writer from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.quartus_writer import QuartusWriter register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) +register_writer('Vitis', VitisWriter) register_writer('Quartus', QuartusWriter) diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py new file mode 100644 index 0000000000..45784acb6b --- /dev/null +++ b/hls4ml/writer/vitis_writer.py @@ -0,0 +1,15 @@ +import os +from shutil import copyfile, copytree +from distutils.dir_util import copy_tree +from hls4ml.writer.vivado_writer import VivadoWriter + +class VitisWriter(VivadoWriter): + + def __init__(self): + super().__init__() + + def write_hls(self, model): + """ + Write the HLS project. Calls the steps from VivadoWriter, adapted for Vitis + """ + super(VitisWriter, self).write_hls(model) From 08fbc285060fc7d01e9cf2b86d48f8480cb532ba Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 21:17:44 +0200 Subject: [PATCH 03/20] Ensure default build options are the same --- hls4ml/templates/vivado/build_prj.tcl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index df01e459ac..cba53de031 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -164,9 +164,11 @@ if {$opt(reset)} { open_solution "solution1" } catch {config_array_partition -maximum_size 4096} -config_compile -name_max_length 60 +config_compile -name_max_length 80 set_part {xcku115-flvb2104-2-i} +config_schedule -enable_dsp_full_reg=false create_clock -period 5 -name default +set_clock_uncertainty 12.5% default if {$opt(csim)} { From 5ca91236e57f3d7dcfd1b9f355266d7866f64134 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 15:40:20 +0200 Subject: [PATCH 04/20] Conditionally use DATA_PACK pragma --- .../nnet_utils/nnet_activation_stream.h | 34 ++++++------ .../vivado/nnet_utils/nnet_batchnorm_stream.h | 6 +-- .../templates/vivado/nnet_utils/nnet_common.h | 11 ++++ .../vivado/nnet_utils/nnet_conv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_conv2d_stream.h | 2 +- .../vivado/nnet_utils/nnet_conv_stream.h | 4 +- .../vivado/nnet_utils/nnet_dense_stream.h | 2 +- .../vivado/nnet_utils/nnet_embed_stream.h | 2 +- .../vivado/nnet_utils/nnet_image_stream.h | 2 +- .../vivado/nnet_utils/nnet_merge_stream.h | 30 +++++------ .../vivado/nnet_utils/nnet_pooling_stream.h | 16 +++--- .../vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../templates/vivado/nnet_utils/nnet_stream.h | 53 ++++++++++--------- 14 files changed, 90 insertions(+), 78 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 8f294daee7..bc36c1a5cf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -40,7 +40,7 @@ void linear(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LinearPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -62,7 +62,7 @@ void relu(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T 
out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -98,7 +98,7 @@ void sigmoid(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -164,7 +164,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -238,7 +238,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -300,7 +300,7 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -356,7 +356,7 @@ void tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) TanHPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -386,7 +386,7 @@ void hard_sigmoid(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -412,7 +412,7 @@ void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hl data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -435,7 +435,7 @@ void thresholded_relu(hls::stream &data, typename data_T::value_type the data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -472,7 +472,7 @@ void softplus(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -511,7 +511,7 @@ void softsign(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -549,7 +549,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) EluPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -596,7 +596,7 @@ void selu(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS 
DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SeluPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -626,7 +626,7 @@ void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -647,7 +647,7 @@ void binary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -668,7 +668,7 @@ void ternary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index a2b406806c..ce49d65b02 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -49,7 +49,7 @@ void normalize( data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormpack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -82,7 +82,7 @@ void normalize_binary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -108,7 +108,7 @@ void normalize_ternary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 9bfae8339a..af59f90218 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -27,6 +27,17 @@ #define MIN(n,d) (n > d ? d : n) #define MAX(n,d) (n > d ? 
n : d) +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable= +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + namespace nnet { // Common type definitions diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index e0f6f08332..e887b2564f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -40,7 +40,7 @@ void conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 5d1c7d1efb..7d451b3ba2 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -44,7 +44,7 @@ void conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 862e8361a1..a922d58541 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -263,7 +263,7 @@ void compute_output_buffer_2d( #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -333,7 +333,7 @@ void compute_output_buffer_1d( #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::kernel_shift_1d(in_elem, kernel_data); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 52c96c52c3..9b0deb9b42 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -56,7 +56,7 @@ void dense( #pragma HLS PIPELINE } res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = res[i_out * res_T::size + i_pack]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h index 3ada00b244..fb8e2fb435 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h @@ -19,7 +19,7 @@ void embedding( #pragma HLS PIPELINE II=CONFIG_T::reuse_factor res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) DenseEmbedding: for (int i = 0; i < CONFIG_T::n_out; i++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h 
index 42d2ce80e0..89f91d6f06 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h @@ -44,7 +44,7 @@ void resize_nearest( #pragma HLS UNROLL data_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ResizeChan: for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index 6b053b8781..c9ac45edf4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -40,7 +40,7 @@ void add( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AddPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -65,7 +65,7 @@ void subtract( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SubtractPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +90,7 @@ void multiply( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MultiplyPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -115,7 +115,7 @@ void average( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AveragePack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -140,7 +140,7 @@ void maximum( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MaximumPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -165,7 +165,7 @@ void minimum( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MinimumPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -188,7 +188,7 @@ void concatenate3d_0( input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -204,7 +204,7 @@ void concatenate3d_0( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -228,7 +228,7 @@ void concatenate3d_1( input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -242,7 +242,7 @@ void concatenate3d_1( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -267,7 +267,7 @@ void concatenate3d_2( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -310,7 +310,7 @@ void concatenate2d_0( input1_T in_data1 = data1.read(); res_T 
out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -324,7 +324,7 @@ void concatenate2d_0( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -347,7 +347,7 @@ void concatenate2d_1( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -383,7 +383,7 @@ void concatenate1d( hls::stream &res) { res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatLoop1: for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { #pragma HLS PIPELINE input1_T in_data1 = data1.read(); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 80fa1d2872..4113d9e363 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -130,7 +130,7 @@ void pooling2d_encoded_cl( assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -176,7 +176,7 @@ void compute_pool_buffer_2d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -344,7 +344,7 @@ void pooling1d_encoded_cl( assert(CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -385,7 +385,7 @@ void compute_pool_buffer_1d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer @@ -523,7 +523,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -535,7 +535,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); @@ -577,7 +577,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -589,7 +589,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index b0f6ce9c64..71ccf1a014 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -26,7 +26,7 @@ void depthwise_conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index 69e2726525..b2c80950a7 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -27,7 +27,7 @@ void depthwise_conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h index 9ee6628fee..b4de14ffdd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h @@ -3,17 +3,18 @@ #define NNET_STREAM_H #include "hls_stream.h" +#include "nnet_common.h" namespace nnet { struct broadcast_config { - static const unsigned in_height = 1; - static const unsigned in_width = 1; - static const unsigned in_chan = 3; - static const unsigned out_height = 2; - static const unsigned out_width = 2; - static const unsigned out_chan = 3; + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; }; template @@ -24,8 +25,8 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre data_T in_data = data.read(); res_T out_data1; res_T out_data2; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 + PRAGMA_DATA_PACK(out_data1) + 
PRAGMA_DATA_PACK(out_data2) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -47,9 +48,9 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre res_T out_data1; res_T out_data2; res_T out_data3; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 - #pragma HLS DATA_PACK variable=out_data3 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -72,7 +73,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +91,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < pack_diff; j++) { #pragma HLS PIPELINE @@ -136,7 +137,7 @@ void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) for (int j = 0; j < n_dupl; j++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL out_data[k] = in_data[k]; @@ -152,20 +153,20 @@ void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) BroadcastLoop: for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - res_T out_data; - #pragma HLS DATA_PACK variable=out_data - for (int k = 0; k < res_T::size; k++) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL - out_data[k] = in_data[0]; - } - res.write(out_data); + out_data[k] = in_data[0]; + } + res.write(out_data); } } template void broadcast_stream(hls::stream &data, hls::stream &res) { if(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { - broadcast_stream_1x1xC(data, res); + broadcast_stream_1x1xC(data, res); } else if(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width) { broadcast_stream_HxWx1(data, res); @@ -180,19 +181,19 @@ void transpose_2d(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - for (int j = 0; j < data_T::size; j++) { - data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); } } for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < res_T::size; j++) { - out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); } - res.write(out_data); + res.write(out_data); } } } From 65dc688de1b5cf2278faa7ab8c04b3770176e4ce Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 17:15:59 +0200 Subject: [PATCH 05/20] Fix allocation pragmas --- .../nnet_utils/nnet_activation_stream.h | 4 +- .../vivado/nnet_utils/nnet_conv2d_latency.h | 53 ++----------------- 
.../vivado/nnet_utils/nnet_dense_compressed.h | 8 +-- .../vivado/nnet_utils/nnet_pooling.h | 8 +-- 4 files changed, 14 insertions(+), 59 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index bc36c1a5cf..f866aa16fd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -167,7 +167,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); @@ -241,7 +241,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 24132e5c6d..724eedbeb6 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -6,50 +6,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template - int compute_multiplier_limit_conv2d( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - //padded - do nothing - continue; - } else { - if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { - n_mult++; - } - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - template void conv_2d_latency_cf( data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], @@ -72,8 +28,7 @@ void conv_2d_latency_cf( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; 
oh < CONFIG_T::out_height; oh++) { @@ -188,8 +143,7 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { @@ -303,8 +257,7 @@ void pointwise_conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h index dc803ff2bc..7202b3a101 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h @@ -51,10 +51,12 @@ void dense_compressed( #pragma HLS ARRAY_PARTITION variable=acc complete #pragma HLS ARRAY_PARTITION variable=biases complete #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - //if (CONFIG_T::store_weights_in_bram){ - //#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else #pragma HLS data_pack variable=weights struct_level - //} +#endif InitAccum: for(unsigned i = 0; i < CONFIG_T::n_out; i++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 5267a58fcb..cd2c580f13 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -109,7 +109,7 @@ void pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { @@ -158,7 +158,7 @@ void global_pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { data_T pool[CONFIG_T::n_in]; @@ -209,7 +209,7 @@ void pooling2d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; @@ -263,7 +263,7 @@ void pooling2d_cf( // TODO partition the arrays according to the reuse factor const int limit = 
pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; From 5f9548b4ebf9659001a4ff12f38e05904711c777 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 17:42:23 +0200 Subject: [PATCH 06/20] Use recursive inlining instead of region --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h | 6 +++--- hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h | 4 ++-- hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index e887b2564f..0dd2e8cdeb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -81,7 +81,7 @@ void conv_1d_cl( typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: conv_1d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 7d451b3ba2..6eff1eef68 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -97,7 +97,7 @@ void conv_2d_cl( typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: conv_2d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index a922d58541..dab9c39ad7 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -89,7 +89,7 @@ void mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -272,7 +272,7 @@ void compute_output_buffer_2d( if ( (sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(kernel_data, res_out, weights, biases); } else { @@ -342,7 +342,7 @@ void compute_output_buffer_1d( if ( (sX - lShiftX) == 0 && pX > lShiftX - 1 ) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(kernel_data, res_out, weights, biases); } else { diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index c0e5d17591..1803653279 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -270,7 +270,7 @@ void dense_resource( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { dense_resource_rf_leq_nin(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 9b0deb9b42..564bafac9c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,7 +16,7 @@ void dense_wrapper( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor dense_latency(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 4113d9e363..08c4a6a8a8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -245,7 +245,7 @@ void pooling2d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling2d_buffer_cl(data, res); @@ -441,7 +441,7 @@ void pooling1d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling1d_buffer_cl(data, res); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 5788d429e1..ce6528995e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -76,7 +76,7 @@ void depthwise_mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(data, res, weights, biases); } else { @@ -156,7 +156,7 @@ void pointwise_mult_buffer( data[id] = data_pack[id]; } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -203,7 +203,7 @@ void compute_depthwise_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { @@ -267,7 +267,7 @@ void compute_depthwise_output_buffer_2d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { From fc819aef6cbbca90a5d43934fa4c9f4753bfa271 Mon Sep 17 
00:00:00 2001 From: Vladimir Loncar Date: Wed, 3 Aug 2022 20:05:36 +0200 Subject: [PATCH 07/20] Add Vitis HLS implementation overrides --- .../vitis/nnet_utils/nnet_conv1d_stream.h | 36 ++ .../vitis/nnet_utils/nnet_conv2d_stream.h | 81 +++++ .../vitis/nnet_utils/nnet_dense_stream.h | 102 ++++++ .../vitis/nnet_utils/nnet_pooling_stream.h | 341 ++++++++++++++++++ .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 88 +++++ .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 112 ++++++ hls4ml/writer/vitis_writer.py | 20 +- 7 files changed, 778 insertions(+), 2 deletions(-) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 0000000000..f054adc3d9 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,36 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +template +void conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + +} + + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 0000000000..1c77f4f3e6 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,81 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// Line Buffer +template +void conv_2d_buffer_latency_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE 
II=CONFIG_T::reuse_factor + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_buffer_resource_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_buffer_latency_cl(data, res, weights, biases); + } else { + conv_2d_buffer_resource_cl(data, res, weights, biases); + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..f8469f0cb5 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,102 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void data_prepare( + hls::stream &data_stream, + typename data_T::value_type data[CONFIG_T::n_in] +) { + #pragma HLS INLINE + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: for(int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + #pragma HLS PIPELINE + data_T data_pack = data_stream.read(); + DataPackPipeline: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write( + typename res_T::value_type res[CONFIG_T::n_out], + hls::stream &res_stream +) { + #pragma HLS INLINE + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: 
for(unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + #pragma HLS PIPELINE + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } + } else { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackSingle: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + +template +void dense( + hls::stream &data_stream, + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) +{ + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + + data_prepare(data_stream, data); + dense_wrapper(data, res, weights, biases); + res_write(res, res_stream); +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 0000000000..f936c7c88a --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "utils/x_hls_utils.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_pooling.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template +T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template +void compute_pool_buffer_2d( + const data_T& in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt], + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // 
Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* +template +void compute_pool_buffer_1d( + const data_T& in_elem, + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ( (sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + #pragma HLS inline recursive + + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template +T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool( + const data_T& in_elem, + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt] +) { + PoolFilt: for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool(data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } + +} + +template +void global_pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION 
variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInput: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } + +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 0000000000..d36dbe5f80 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,88 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv1d_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } +} + +template +void pointwise_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + + +template +void separable_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t 
depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 0000000000..a483c46ddf --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,112 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv2d_stream.h" + +namespace nnet { + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } +} + + +template +void pointwise_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < 
CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } +} + +template +void separable_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py index 45784acb6b..44b7d97c0f 100644 --- a/hls4ml/writer/vitis_writer.py +++ b/hls4ml/writer/vitis_writer.py @@ -1,6 +1,6 @@ import os -from shutil import copyfile, copytree -from distutils.dir_util import copy_tree +import glob +from shutil import copy from hls4ml.writer.vivado_writer import VivadoWriter class VitisWriter(VivadoWriter): @@ -8,8 +8,24 @@ class VitisWriter(VivadoWriter): def __init__(self): super().__init__() + def write_nnet_utils_overrides(self, model): + ################### + ## nnet_utils + ################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir,'../templates/vitis/nnet_utils/') + dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copy(srcpath + h, dstpath + h) + def write_hls(self, model): """ Write the HLS project. 
Calls the steps from VivadoWriter, adapted for Vitis """ super(VitisWriter, self).write_hls(model) + self.write_nnet_utils_overrides(model) From 3d6463981379b4b38d4ee901eabf0ec75321f3b6 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 3 Aug 2022 20:05:57 +0200 Subject: [PATCH 08/20] Supported feature validation for Vitis backend --- hls4ml/backends/vitis/passes/feature_check.py | 25 +++++++++++++++++++ hls4ml/backends/vitis/vitis_backend.py | 17 ++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 hls4ml/backends/vitis/passes/feature_check.py diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py new file mode 100644 index 0000000000..ee3e6d83be --- /dev/null +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -0,0 +1,25 @@ +from hls4ml.model.optimizer import OptimizerPass + + +class ValidateConvImplementation(OptimizerPass): + + def match(self, node): + return 'Conv' in node.class_name + + def transform(self, model, node): + if node.get_attr('implementation', 'linebuffer') == 'encoded': + print(f'WARNING: "Encoded" implementation in "{node.name}" ({node.class_name}) is not supported in Vitis backend. Switching to "LineBuffer" implementation.') + node.set_attr('implementation', 'linebuffer') + + +class ValidateStrategy(OptimizerPass): + _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] + + def match(self, node): + is_resource_layer = len([layer_cls for layer_cls in self._resource_layer_cls if layer_cls in node.class_name]) > 0 + is_resource_strategy = node.model.config.is_resource_strategy(node) + + return is_resource_layer and is_resource_strategy + + def transform(self, model, node): + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. 
Consider switching to "Latency" strategy.') diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 512917e933..b8b4bc7989 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -2,7 +2,7 @@ import sys from hls4ml.backends import VivadoBackend -from hls4ml.model.flow import register_flow +from hls4ml.model.flow import register_flow, get_flow from hls4ml.report import parse_vivado_report @@ -12,10 +12,19 @@ def __init__(self): self._register_flows() def _register_flows(self): - vivado_ip = 'vivado:ip' + validation_passes = [ + 'vitis:validate_conv_implementation', + 'vitis:validate_strategy', + ] + validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + writer_passes = ['make_stamp', 'vitis:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) - self._default_flow = vivado_ip + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + + ip_flow_requirements = get_flow('vivado:ip').requires.copy() + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): if 'linux' in sys.platform: From 1c4b6edd6ea7184d2738c9b12c9b03b60dca2e83 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 4 Aug 2022 10:21:30 +0200 Subject: [PATCH 09/20] Fix Vitis pragmas --- .../vitis/nnet_utils/nnet_dense_stream.h | 29 +++++++++----- .../vivado/nnet_utils/nnet_conv1d_latency.h | 40 +------------------ 2 files changed, 22 insertions(+), 47 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h index f8469f0cb5..955dc9e784 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -10,19 +10,24 @@ namespace nnet { template -void dense_wrapper( +void dense_latency_wrapper( data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); - } + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + dense_resource(data, res, weights, biases); } template @@ -86,6 +91,8 @@ void dense( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE recursive + typename data_T::value_type data[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=data complete @@ -93,7 +100,11 @@ void dense( #pragma HLS ARRAY_PARTITION variable=res complete data_prepare(data_stream, data); - dense_wrapper(data, res, weights, biases); + if (CONFIG_T::strategy == nnet::latency) { + 
dense_latency_wrapper(data, res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, biases); + } res_write(res, res_stream); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index f79903ee2f..15fd2b49e1 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -6,40 +6,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template -int compute_multiplier_limit( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - for(int ii = 0; ii < CONFIG_T::out_width; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int jj = 0; jj < CONFIG_T::filt_width; jj++){ - - int index_weight = jj*CONFIG_T::n_chan*CONFIG_T::n_filt + cc*CONFIG_T::n_filt + ff; - - if((ii*CONFIG_T::stride_width+jj) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width+jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ - //padded -- do nothing - continue; - } else { - //need to tune this cut? - if( weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20 ){ - n_mult++; - }//end if nonzero weight - }//end not padding - }//end loop accross filter - }//end channel loop - }//end filter loop - }//end output loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - - template void conv_1d_latency_cl( data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], @@ -63,8 +29,7 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width; ii++) { @@ -141,8 +106,7 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width; ii++) { From d0a1e3ab29722072c8ef542fc20e58bddd526308 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 4 Aug 2022 10:31:33 +0200 Subject: [PATCH 10/20] Treat Vitis backend as module --- hls4ml/backends/vitis/__init__.py | 0 hls4ml/backends/vitis/passes/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 hls4ml/backends/vitis/__init__.py create mode 100644 hls4ml/backends/vitis/passes/__init__.py diff --git a/hls4ml/backends/vitis/__init__.py b/hls4ml/backends/vitis/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/vitis/passes/__init__.py b/hls4ml/backends/vitis/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 28407c585e9ad2fb1686c7e48373837fdbf1a90e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 17 Aug 2022 18:36:55 +0200 Subject: [PATCH 11/20] Limit function instances in pooling layers --- 
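For orientation: the header added below caps the number of synthesized pool_op instances with an ALLOCATION pragma whose limit is derived at compile time from the layer geometry. The following is only an illustrative sketch with made-up sizes (example_pool_op_limit is hypothetical; pool_op_limit_1d is the real helper introduced below):

    // Hypothetical sketch: how the 1D pool_op instance limit scales with layer size.
    constexpr int example_pool_op_limit(int n_in, int n_filt, int reuse_factor) {
        return n_in * n_filt / reuse_factor;
    }
    static_assert(example_pool_op_limit(100, 8, 4) == 200, "e.g. 100 inputs, 8 filters, reuse factor 4");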
.../templates/vitis/nnet_utils/nnet_pooling.h | 314 ++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_pooling.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h new file mode 100644 index 0000000000..1fb2ecca7d --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -0,0 +1,314 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet{ + +// Return the maximum value from an array +template +T max(T x[N]){ + T y = x[0]; + for(int i = 1; i < N; i++){ + y = x[i] > y ? x[i] : y; + } + return y; +} + +template +ap_int avg(ap_int (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_int tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_int y = tmp; + return tmp; +} + +template +ap_fixed avg(ap_fixed (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_fixed tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template +T avg(T (&x)[N]){ + T y = 0; + for(int i = 0; i < N; i++){ + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template +T pool_op(T (&x)[N]){ + switch(op){ + case Max: return max(x); + case Average: return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template +T pad_val(){ + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
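+ *- (Illustration: with an ap_fixed<16,6> element type the value built below
+ *- has only its sign bit set, i.e. -32.0, so it can never win a Max compare.)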
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch(op){ + case Max:{ + T x = 0; + x[x.width - 1] = 1; + return x; + break;} + case Average: return 0; + } +} + +struct pooling1d_config{ + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for(int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for(int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if(ii+jj < CONFIG_T::pad_left || ii+jj >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[jj] = pad_val(); + }else{ + pool[jj] = data[(ii + jj) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average) { + data_T rescale = CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + for(int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config{ + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) 
/ stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template +constexpr int pool_op_limit(){ + return DIV_ROUNDUP((CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt, CONFIG_T::reuse_factor); +} + +template +void pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width * CONFIG_T::n_filt + (jj + ll) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any 
necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width + ff * CONFIG_T::in_width*CONFIG_T::in_height + ll + jj]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +} + +#endif From 85e19a77306d14711e051daf0d7475319e444e58 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Aug 2022 16:03:14 +0200 Subject: [PATCH 12/20] Use consistent resource names for reports --- hls4ml/report/vivado_report.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index c7773ad9ff..736fc3354b 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -164,8 +164,13 @@ def parse_vivado_report(hls_dir): # Area area_node = root.find('./AreaEstimates') for child in area_node.find('./Resources'): + # DSPs are called 'DSP48E' in Vivado and just 'DSP' in Vitis. 
Overriding here to have consistent keys + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report[child.tag] = child.text for child in area_node.find('./AvailableResources'): + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report['Available' + child.tag] = child.text report['CSynthesisReport'] = c_synth_report else: From d8f298f39d49e55f0f62c8f936666b5e21f0a5d9 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Aug 2022 19:27:18 +0200 Subject: [PATCH 13/20] Support RNNs in Vitis --- hls4ml/backends/vitis/vitis_backend.py | 1 + hls4ml/backends/vivado/passes/recurrent_templates.py | 1 + hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h | 8 ++++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index b8b4bc7989..dbcf87c31e 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -9,6 +9,7 @@ class VitisBackend(VivadoBackend): def __init__(self): super(VivadoBackend, self).__init__(name='Vitis') + self._register_layer_attributes() self._register_flows() def _register_flows(self): diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 74ec61e823..d7c826e74a 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index e94286aa8e..a2581a94c4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -293,7 +293,7 @@ template nnet::lstm(reset_state,data_in,h_newstate, s_newstate, param,param_r,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -305,7 +305,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -565,7 +565,7 @@ template nnet::gru(reset_state,data_in,h_newstate,param,param_zr,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -577,7 +577,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; From ee91246b51b6709b7f7f8810ca1e673dbf3183ff Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 19 Oct 2022 19:12:18 +0200 Subject: [PATCH 14/20] Properly set the multiplier limit for Conv1D/2D 
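The hunks below read the limit from the nested multiplier config (CONFIG_T::mult_config::multiplier_limit) rather than the outer convolution config. A minimal sketch of that nesting, with illustrative names and values only:

    // Hypothetical configs; only the mult_config member name is taken from the patch.
    struct example_mult_config {
        static const unsigned multiplier_limit = 32; // made-up value
    };
    struct example_conv_config {
        typedef example_mult_config mult_config;     // what CONFIG_T::mult_config resolves to
    };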
--- .../vivado/nnet_utils/nnet_conv1d_latency.h | 2 +- .../vivado/nnet_utils/nnet_conv2d_latency.h | 117 +----------------- 2 files changed, 2 insertions(+), 117 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0f9f51debd..dd7225346d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -30,7 +30,7 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 6d46836f82..43222696c3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -7,121 +7,6 @@ namespace nnet { -template -void conv_2d_latency_cf( - data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height*CONFIG_T::out_width*CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ - - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - ConvOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - ConvFiltHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - ConvFiltWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= 
(CONFIG_T::pad_left+CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - int index_data = cc*CONFIG_T::in_height*CONFIG_T::in_width - + (oh*CONFIG_T::stride_height+fh-CONFIG_T::pad_top)*CONFIG_T::in_width - + (ow*CONFIG_T::stride_width+fw-CONFIG_T::pad_left); - mult[index_mult] = data[index_data] * weights[index_weight]; - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - - // Initialize accumulator with input biases - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - acc[oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff]=biases[ff]; - } - } - } - - - // Accumulate multiplication result - AccumOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - AccumOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - AccumDotHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - AccumDotWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - int index_acc = oh*CONFIG_T::out_width*CONFIG_T::n_filt - + ow*CONFIG_T::n_filt - + ff; - - acc[index_acc] += mult[index_mult]; - - }//end dot product filter width loop - }//end dot product filter height loop - }//end n channel loop - }//end n filter loop - }//end output width loop - }//end output height loop - - // Cast to "res_t" type - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - int res_index = ff*CONFIG_T::out_height*CONFIG_T::out_width + oh*CONFIG_T::out_width + ow; - int acc_index = oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff; - res[res_index] = acc[acc_index]; - } - } - } - -}//end conv2d - template void conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], @@ -145,7 +30,7 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { From f905162d3e6b25bddf7fe7e0aeb2a5fbe01706d8 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 19 Oct 2022 19:39:42 +0200 Subject: [PATCH 15/20] Add option to configure clock uncertainty --- hls4ml/templates/vivado/build_prj.tcl | 2 +- hls4ml/writer/vivado_writer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index f059852095..d34337c573 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -166,7 +166,7 @@ config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false 
create_clock -period $clock_period -name default -set_clock_uncertainty 12.5% default +set_clock_uncertainty $clock_uncertainty default if {$opt(csim)} { diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 9cdbefb4e1..03a8923c77 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -516,6 +516,8 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.close() ################### From 5a8fd9eecd1b9521ce98dcef09e32e7f8584cebc Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 20 Oct 2022 14:59:35 +0200 Subject: [PATCH 16/20] Add uncertainty to accelerator writer as well --- hls4ml/writer/vivado_accelerator_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index f979b60321..b92ce74ab7 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -340,6 +340,8 @@ def write_board_script(self, model): f.write('set part "{}"\n'.format(self.vivado_accelerator_config.get_part())) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write('set bit_width_hls_output {}\n'.format(in_bit)) From c6dbcf8816fef149b5b9c79ce1376ac8d12475f0 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 20 Oct 2022 21:02:56 +0200 Subject: [PATCH 17/20] Enabling resource strategy for Vitis backend --- hls4ml/backends/vitis/passes/feature_check.py | 5 +- .../vitis/nnet_utils/nnet_dense_resource.h | 247 ++++++++++++++++++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py index ee3e6d83be..eddd5530f3 100644 --- a/hls4ml/backends/vitis/passes/feature_check.py +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -22,4 +22,7 @@ def match(self, node): return is_resource_layer and is_resource_strategy def transform(self, model, node): - print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. Consider switching to "Latency" strategy.') + n_in, _ = model.config.backend.get_layer_mult_size(node) + rf = node.get_attr('reuse_factor') + if rf > n_in and rf % n_in > 0: + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. 
Consider using a different ReuseFactor or switching to "Latency" strategy.')
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h
new file mode 100644
index 0000000000..d96b75b47a
--- /dev/null
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h
@@ -0,0 +1,247 @@
+#ifndef NNET_DENSE_RESOURCE_H_
+#define NNET_DENSE_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include "hls_stream.h"
+#include
+#include
+
+namespace nnet {
+
+template
+void dense_resource_rf_leq_nin(
+    data_T data[CONFIG_T::n_in],
+    res_T res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit / CONFIG_T::n_out;
+
+    assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed");
+    assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN");
+
+    // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices.
+    // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own.
+    typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights;
+
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    InitAccum:
+    for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
+    }
+
+    ReuseLoop:
+    for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) {
+        #pragma HLS PIPELINE II=1 rewind
+
+        int in_index = ir;
+        int out_index = 0;
+        int acc_step = 0;
+
+        MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+
+            acc[out_index] += static_cast(
+                CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]));
+
+            // Increment in_index
+            in_index += CONFIG_T::reuse_factor;
+            if (in_index >= CONFIG_T::n_in) {
+                in_index = ir;
+            }
+            // Increment out_index
+            if (acc_step + 1 >= multscale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+
+    // Cast to "res_t" type
+    Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        res[ires] = cast(acc[ires]);
+    }
+}
+
+template
+void dense_resource_rf_gt_nin_rem0(
+    data_T data[CONFIG_T::n_in],
+    res_T res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::n_in);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+
+    assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed");
+    assert((CONFIG_T::reuse_factor > CONFIG_T::n_in && CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0");
+
+    // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices.
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = CONFIG_T::reuse_factor / CONFIG_T::n_in; + + int outidx[CONFIG_T::reuse_factor]; + IndexLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % CONFIG_T::n_in == 0) { + outstep++; + } + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + out_index = outidx[ir]/*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + out_index += outscale; + } + + in_index++; + if (in_index >= CONFIG_T::n_in) { + in_index = 0; + //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = CONFIG_T::n_out; + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int in_index = w_index % CONFIG_T::n_in; // As of Vitis HLS 2022.1, this still results in urem core being used. 
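+            // Worked example (illustrative numbers only, not part of the generated code): with n_in = 3,
+            // reuse_factor = 4 and ir = 1, w_index steps through 1, 5, 9, ..., so in_index = w_index % n_in
+            // gives 1, 2, 0, ... while out_index = w_index / n_in (computed in AccumLoop1 below) gives 0, 1, 3, ...
+            // It is these explicit % and / by n_in that Vitis maps onto the urem/udiv cores mentioned above.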
+ tmpmult[im] = CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int out_index = w_index / CONFIG_T::n_in; + if (out_index >= multiplier_limit) continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} + +#endif From 5c48d155482eb59c6323299da0d66afeb9a02ca3 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 24 Oct 2022 20:11:44 +0200 Subject: [PATCH 18/20] Resource strategy for Conv1D/2D io_parallel --- .../vitis/nnet_utils/nnet_conv1d_resource.h | 102 +++++++++++++++++ .../vitis/nnet_utils/nnet_conv2d_resource.h | 104 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 0000000000..6477bbd902 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,102 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. 
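+    // In effect the flat weight array is read as weights_2d[block_factor][reuse_factor]: the access
+    // weights_2d[i_blk][i_rf] below lands on flat offset i_blk * reuse_factor + i_rf through plain 2D
+    // addressing, so no remainder computation is left for the tool to turn into a urem core.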
+ typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 0000000000..ea0afc7d29 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,104 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = multiplier_limit / mult_n_out; + + assert((multiplier_limit % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. 
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif From ede3350920bd445df9a8ffc1a133fb2fb933ae07 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 17 Mar 2023 22:06:43 +0100 Subject: [PATCH 19/20] Add Vitis to tests --- hls4ml/backends/vitis/vitis_backend.py | 4 +++ .../templates/vitis/nnet_utils/nnet_pooling.h | 26 +++++++++++++++++++ .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 3 ++- .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 3 ++- hls4ml/writer/vivado_writer.py | 2 +- test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 2 +- test/pytest/test_causalpadding.py | 2 +- test/pytest/test_cnn_mnist.py | 4 +++ test/pytest/test_cnn_mnist_qkeras.py | 14 ++++++++-- test/pytest/test_conv1d.py | 12 +++++++-- test/pytest/test_embed.py | 4 +-- test/pytest/test_extensions.py | 9 ++++--- test/pytest/test_globalpooling.py | 4 +-- test/pytest/test_keras_api.py | 14 +++++----- test/pytest/test_keras_h5_loader.py | 2 +- test/pytest/test_merge.py | 10 +++---- test/pytest/test_pointwiseconv.py | 8 +++--- test/pytest/test_qkeras.py | 6 ++--- test/pytest/test_rnn.py | 4 +++ test/pytest/test_sepconv2d.py | 7 ++--- test/pytest/test_softmax.py | 4 +-- test/pytest/test_softsign.py | 2 +- test/pytest/test_trace.py | 2 +- test/pytest/test_transpose_concat.py | 4 +-- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 27 files changed, 111 insertions(+), 47 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 
dbcf87c31e..8fc4ab9c3d 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -19,11 +19,15 @@ def _register_flows(self): ] validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + # Any potential templates registered specifically for Vitis backend + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=['vivado:init_layers'], backend=self.name) + writer_passes = ['make_stamp', 'vitis:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) ip_flow_requirements = get_flow('vivado:ip').requires.copy() ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow) self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h index 1fb2ecca7d..ac921e0d3a 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -309,6 +309,32 @@ void pooling2d_cf( } } + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + FiltLoop: + for(int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0 ; i < CONFIG_T::in_height * CONFIG_T::in_width ; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + } #endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index d36dbe5f80..6850497ffd 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -72,7 +72,8 @@ void separable_conv_1d_cl( typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] ) { - assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); #pragma HLS DATAFLOW diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index a483c46ddf..352828ecd9 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -96,7 +96,8 @@ void separable_conv_2d_cl( typename CONFIG_T::depthwise_config::bias_t 
depthwise_biases[CONFIG_T::depthwise_config::n_chan], typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] ) { - assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); #pragma HLS DATAFLOW diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 4e8cd9ad95..c70e28bb5f 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -643,7 +643,7 @@ def write_nnet_utils(self, model): # custom source filedir = os.path.dirname(os.path.abspath(__file__)) - custom_source = get_backend('Vivado').get_custom_source() + custom_source = model.config.backend.get_custom_source() for dst, srcpath in custom_source.items(): dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' copyfile(srcpath, dstpath) diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 7aea0884e0..9875bfe144 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -9,7 +9,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index 1b17637d92..f50329230b 100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py index d183d81c41..4e128b8744 100644 --- a/test/pytest/test_causalpadding.py +++ b/test/pytest/test_causalpadding.py @@ -10,7 +10,7 @@ atol = 5e-3 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_causalpadding(io_type, backend): model = Sequential() diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 262ae50138..ab3365f228 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -58,6 +58,10 @@ def keras_model(mnist_data): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_cnn_mnist_qkeras.py b/test/pytest/test_cnn_mnist_qkeras.py index c34e0965a6..cf3dbf17d9 100644 --- a/test/pytest/test_cnn_mnist_qkeras.py +++ b/test/pytest/test_cnn_mnist_qkeras.py @@ -40,7 +40,12 @@ def mnist_model(): ('Vivado', 
'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def hls_model(mnist_model, backend, io_type, strategy): keras_model = mnist_model @@ -66,7 +71,12 @@ def hls_model(mnist_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def test_accuracy(mnist_data, mnist_model, hls_model): x_train, y_train, x_test, y_test = mnist_data diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 1d91d80ea3..bc8a680022 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -30,7 +30,11 @@ def keras_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def hls_model(keras_model, backend, io_type, strategy): default_precision = 'ap_fixed<16,3,AP_RND_CONV,AP_SAT>' if backend=='Vivado' else 'ac_fixed<16,3,true,AC_RND_CONV,AC_SAT>' @@ -63,7 +67,11 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index 8073a7a1a5..fd8e39cdb9 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 1c8e07198a..9945768ea6 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -126,11 +126,14 @@ def regsister_custom_layer(): hls4ml.model.layers.register_layer('HReverse', HReverse) -@pytest.mark.parametrize('backend_id', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend_id', ['Vivado', 'Vitis', 'Quartus']) def test_extensions(tmp_path, 
backend_id): # Register the optimization passes (if any) backend = hls4ml.backends.get_backend(backend_id) - backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize') + ip_flow = hls4ml.model.flow.get_flow(backend.get_default_flow()) + # Add the pass into the main optimization flow + optimize_flow = [flow for flow in ip_flow.requires if ':optimize' in flow][0] + backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=optimize_flow) # Register template passes for the given backend backend.register_template(HReverseConfigTemplate) @@ -168,6 +171,6 @@ def test_extensions(tmp_path, backend_id): hres = hmodel.predict(x.astype('float32')) # Check if the optimizer pass was applied - assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][f'{backend_id.lower()}:optimize'] + assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][optimize_flow] np.testing.assert_array_equal(kres, hres) diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index 79260afbdf..829c8f5d9f 100644 --- a/test/pytest/test_globalpooling.py +++ b/test/pytest/test_globalpooling.py @@ -30,7 +30,7 @@ def keras_model_avg_1d(): return model -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize('model_type', ['max', 'avg']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_global_pool1d(backend, keras_model_max_1d, keras_model_avg_1d, data_1d, model_type, io_type): @@ -70,7 +70,7 @@ def keras_model_avg_2d(): model.compile() return model -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize('model_type', ['max', 'avg']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_global_pool2d(backend, keras_model_max_2d, keras_model_avg_2d, data_2d, model_type, io_type): diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index bd3f175b18..6da5166468 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -15,7 +15,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_dense(backend, io_type): model = tf.keras.models.Sequential() @@ -66,7 +66,7 @@ def test_dense(backend, io_type): PReLU(alpha_initializer="zeros",), Activation(activation='sigmoid', name='Activation')]) #ThresholdedReLU(theta=1.0)]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activations(activation_function, backend, io_type): model = tf.keras.models.Sequential() @@ -94,7 +94,7 @@ def test_activations(activation_function, backend, io_type): padds_options = ['same', 'valid'] @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv1d(padds, backend, io_type): model = tf.keras.models.Sequential() @@ -123,8 +123,8 @@ def test_conv1d(padds, backend, io_type): # 5e-2 might be too high 
    np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2)
-    if not (backend=='Vivado' and io_type=='io_stream' and padds=='same'):
-        # Vivado inserts and additional layer for 'same' padding in io_stream
+    if not (backend in ['Vivado', 'Vitis'] and io_type=='io_stream' and padds=='same'):
+        # Vivado/Vitis inserts an additional layer for 'same' padding in io_stream
         assert len(model.layers) + 2 == len(hls_model.get_layers())
     assert list(hls_model.get_layers())[1].attributes['name'] == model.layers[0]._name
     assert list(hls_model.get_layers())[1].attributes['class_name'] == 'Conv1D'
@@ -154,7 +154,7 @@ def test_conv1d(padds, backend, io_type):
 padds_options=['same', 'valid']
 @pytest.mark.parametrize('chans', chans_options)
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv2d(chans, padds, backend, io_type):
     model = tf.keras.models.Sequential()
@@ -235,7 +235,7 @@ def test_conv2d(chans, padds, backend, io_type):
 @pytest.mark.parametrize('pooling', pooling_layers)
 @pytest.mark.parametrize('padds', padds_options)
 @pytest.mark.parametrize('chans', chans_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
 def test_pooling(pooling, padds, chans, backend):
     assert '1D' in pooling.__name__ or '2D' in pooling.__name__
diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py
index 0fa689e451..08753d5846 100644
--- a/test/pytest/test_keras_h5_loader.py
+++ b/test/pytest/test_keras_h5_loader.py
@@ -8,7 +8,7 @@
 test_root_path = Path(__file__).parent
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
 def test_keras_h5_loader(backend):
     input_shape = (10,)
     model = tf.keras.models.Sequential([
diff --git a/test/pytest/test_merge.py b/test/pytest/test_merge.py
index 470e9b3ffa..8ab4fa3a15 100644
--- a/test/pytest/test_merge.py
+++ b/test/pytest/test_merge.py
@@ -9,7 +9,7 @@
 @pytest.mark.parametrize('merge_layer', [Add, Average, Maximum, Minimum, Multiply, Subtract])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
-@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate2d(axis, io_type, backend): input_shape = (10, 3) @@ -114,7 +114,7 @@ def test_concatenate2d(axis, io_type, backend): @pytest.mark.parametrize('axis', [1, 2, 3]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate3d(axis, io_type, backend): input_shape = (10, 10, 3) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 7650056f87..d43e352883 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -20,11 +20,13 @@ @pytest.mark.parametrize('backend, io_type, strategy', [ ('Quartus', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'resource'), - + ('Vitis', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), - + ('Vitis', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): model = tf.keras.models.Sequential() diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 19fa8375fc..e7fa1ea15a 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -127,7 +127,7 @@ def randX_100_16(): # https://github.com/fastmachinelearning/hls4ml/issues/381 # @pytest.mark.parametrize('bits', [4, 6, 8]) @pytest.mark.parametrize('bits,alpha', [(4, 1), (4, 'auto_po2')]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_type): ''' @@ -197,7 +197,7 @@ def randX_100_10(): (7, 10, binary(), quantized_bits(5, 2), binary(), False, True), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_btnn(make_btnn, randX_100_10, backend, io_type): model, is_xnor, test_no = make_btnn @@ -240,7 +240,7 @@ def randX_1000_1(): (quantized_relu(10, 5)), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_quantizer(randX_1000_1, quantizer, backend, io_type): ''' diff --git a/test/pytest/test_rnn.py b/test/pytest/test_rnn.py index 12fc426019..aa49e43d3f 100644 --- a/test/pytest/test_rnn.py +++ b/test/pytest/test_rnn.py @@ -70,10 +70,14 @@ def test_rnn_parsing(rnn_layer, return_sequences): [ (SimpleRNN, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_parallel'), + (LSTM, 'Vitis', 'io_parallel'), (LSTM, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_stream'), + (LSTM, 'Vitis', 'io_stream'), (GRU, 'Vivado', 'io_parallel'), (GRU, 'Vivado', 'io_stream'), + (GRU, 'Vitis', 'io_parallel'), + (GRU, 'Vitis', 'io_stream'), (GRU, 'Quartus', 'io_parallel'), (GRU, 'Quartus', 'io_stream'), ], diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 7815d57702..d32569449e 100644 --- a/test/pytest/test_sepconv2d.py +++ 
b/test/pytest/test_sepconv2d.py @@ -23,7 +23,8 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) model.add(conv2d(filters=32, @@ -42,8 +43,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') - output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds)) - hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type) + output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type)) + hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend) hls_model.compile() hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 749a019f39..9290faf500 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -23,7 +23,7 @@ def high_accuracy_distribution(shape): def generate_data(function, input_shape): return function((1000, *input_shape)) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('strategy', ['stable', 'argmax']) @pytest.mark.parametrize('function,input_shape,io_type', [ (flat_distribution, (8,), 'io_parallel'), @@ -58,7 +58,7 @@ def test_softmax(backend, strategy, generate_data, input_shape, io_type, functio assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 2f70b8251d..338aaf6f31 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -7,7 +7,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('input_shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_trace.py b/test/pytest/test_trace.py index ce01c4213e..4c7cde4ac5 100644 --- a/test/pytest/test_trace.py +++ b/test/pytest/test_trace.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_trace(backend): '''Test the tracing feature with a simple Keras model.''' model = tf.keras.models.Sequential() diff --git a/test/pytest/test_transpose_concat.py 
b/test/pytest/test_transpose_concat.py index 488fc46b60..db3e03125f 100644 --- a/test/pytest/test_transpose_concat.py +++ b/test/pytest/test_transpose_concat.py @@ -29,7 +29,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name' @@ -45,7 +45,7 @@ def hls_model(keras_model, backend, io_type): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_accuracy(data, keras_model, hls_model): X = data model = keras_model diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 7e698fd907..0f51301621 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -41,7 +41,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 219f727c06..ca539a9ef5 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -45,7 +45,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': From f37f5af1f783fd1da0d4cffa8dd80a9ef9e9dc25 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Fri, 24 Mar 2023 14:41:33 -0700 Subject: [PATCH 20/20] Merge with main (#52) * Add quantized sigmoid, fix quantized tanh for QKeras (#569) * snapshot of beginnings * make version that works for Vivado, error for Quartus * Change order of precision from quantizer * add hard sigmoid and tanh * fix setting of slope and shift type * revert config parsing--seems a little strange but works * fix hard_sigmoid and hard_tanh for streaming * update pytest for quantized tanh and sigmoid * remove inadvertently included matoplotlib * add special case when W == min_width. * fix merge of main * Go back to having AP_TRN and AP_WRP as defaults * handle case when use_real_tanh is not defined * make the activations use AP_RND_CONV (and AP_SAT) by default * remove use of use_real_tanh in test since not always supported * fix incorrect default types for Keras (not QKeras) hard_sigmoid * Mostly fix up things for Quartus * get rid of intermediate cast * fix an i++ compilation issue * Quartus seems to not like ac_fixed<1,0,false>, so make 2 bits. 
* fix activation quantizer * make sat, round defeult activation parameters, don't need to set again * Make the slope and shift not be configurable for HardActivation * some pre-commit fixes * pre-commint //hls to // hls fixes * update CI version * fixes for parsing errors from pre-commits * remove qactivation from list of activation_layers * print_vivado_report function for nicer reports (#730) * print_vivado_report function for fancier reports * Fancy reports (#51) * fix uram divide by 0 * add test * fix parsing of vsynth in 2020.1; add test * Update test_report.py * exclude pregenerated reports --------- Co-authored-by: Javier Duarte --------- Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir --- .pre-commit-config.yaml | 2 +- .../backends/quartus/passes/core_templates.py | 58 +- .../backends/vivado/passes/core_templates.py | 58 +- hls4ml/converters/keras/core.py | 2 + hls4ml/converters/keras/qkeras_layers.py | 24 +- hls4ml/converters/keras_to_hls.py | 11 +- hls4ml/model/layers.py | 27 + hls4ml/report/__init__.py | 12 +- hls4ml/report/vivado_report.py | 438 ++++++++- hls4ml/templates/quartus/firmware/defines.h | 50 +- .../templates/quartus/firmware/myproject.cpp | 47 +- hls4ml/templates/quartus/firmware/myproject.h | 24 +- .../firmware/nnet_utils/nnet_activation.h | 465 +++++----- .../nnet_utils/nnet_activation_stream.h | 460 +++++---- .../templates/quartus/firmware/parameters.h | 5 +- hls4ml/templates/quartus/myproject_bridge.cpp | 21 +- .../quartus/myproject_test_parallel.cpp | 111 ++- .../quartus/myproject_test_stream.cpp | 122 ++- hls4ml/templates/vivado/firmware/defines.h | 6 +- .../templates/vivado/firmware/myproject.cpp | 8 +- hls4ml/templates/vivado/firmware/myproject.h | 4 +- hls4ml/templates/vivado/firmware/parameters.h | 12 +- hls4ml/templates/vivado/myproject_bridge.cpp | 24 +- hls4ml/templates/vivado/myproject_test.cpp | 142 ++- .../vivado/nnet_utils/nnet_activation.h | 481 +++++----- .../nnet_utils/nnet_activation_stream.h | 367 +++++--- .../vivado/nnet_utils/nnet_code_gen.h | 35 +- .../vivado_accelerator/myproject_axi.cpp | 17 +- .../vivado_accelerator/myproject_axi.h | 9 +- hls4ml/utils/config.py | 20 +- hls4ml/writer/quartus_writer.py | 71 +- hls4ml/writer/vivado_accelerator_writer.py | 305 +++--- hls4ml/writer/vivado_writer.py | 49 +- test/pytest/test_qkeras.py | 50 +- test/pytest/test_report.py | 71 ++ test/pytest/test_report/myproject_csynth.rpt | 196 ++++ test/pytest/test_report/myproject_csynth.xml | 878 ++++++++++++++++++ test/pytest/test_report/vivado_hls.app | 15 + test/pytest/test_report/vivado_synth.rpt | 184 ++++ 39 files changed, 3443 insertions(+), 1438 deletions(-) mode change 100755 => 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h create mode 100644 test/pytest/test_report.py create mode 100644 test/pytest/test_report/myproject_csynth.rpt create mode 100644 test/pytest/test_report/myproject_csynth.xml create mode 100644 test/pytest/test_report/vivado_hls.app create mode 100644 test/pytest/test_report/vivado_synth.rpt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f1f6823a9..83d09fbe31 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: ^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/ +exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pytest/test_report/) repos: - repo: https://github.com/psf/black diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index 
26b99db105..aece9fc226 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -1,7 +1,6 @@ - from hls4ml.backends.backend import get_backend -from hls4ml.model.layers import Activation, BatchNormalization, Dense, PReLU, ParametrizedActivation, Softmax -from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax # Dense templates @@ -38,24 +37,28 @@ dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) self.template = dense_config_template - + def format(self, node): params = self._default_config_params(node) params['nzeros'] = node.get_weights('weight').nzeros params['nonzeros'] = node.get_weights('weight').nonzeros - params['product_type'] = get_backend('quartus').product_type(node.get_input_variable().type.precision, node.get_weights('weight').type.precision) + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) return self.template.format(**params) + class DenseFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(Dense, include_header=dense_include_list) self.template = dense_function_template - + def format(self, node): params = self._default_function_params(node) params['w'] = node.get_weights('weight').name @@ -82,23 +85,27 @@ def format(self, node): batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + class BatchNormalizationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(BatchNormalization) self.template = batchnorm_config_template - + def format(self, node): params = self._default_config_params(node) params['n_in'] = node.get_input_variable().size_cpp() - params['product_type'] = get_backend('quartus').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) return self.template.format(**params) + class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name @@ -117,6 +124,16 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; static const unsigned table_size = {table_size}; @@ -132,6 +149,7 @@ def format(self, node): activ_include_list = 
['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) @@ -143,16 +161,30 @@ def format(self, node): return self.template.format(**params) + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class SoftmaxConfigTemplate(ActivationConfigTemplate): def __init__(self): - super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ self.template = softmax_config_template + class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__((Activation, Softmax), include_header=activ_include_list) + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) self.template = activ_function_template - + def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() @@ -160,6 +192,7 @@ def format(self, node): return self.template.format(**params) + class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(ParametrizedActivation, include_header=activ_include_list) @@ -173,6 +206,7 @@ def format(self, node): return self.template.format(**params) + class PReLUFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(PReLU, include_header=activ_include_list) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 8327e3a7fe..c8119c0c2e 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -1,7 +1,6 @@ - from hls4ml.backends.backend import get_backend -from hls4ml.model.layers import Activation, BatchNormalization, Dense, Embedding, PReLU, ParametrizedActivation, Softmax -from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax # Dense templates @@ -27,24 +26,28 @@ dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) self.template = dense_config_template - + def format(self, node): params = self._default_config_params(node) params['nzeros'] = node.get_weights('weight').nzeros params['nonzeros'] = node.get_weights('weight').nonzeros - params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('weight').type.precision) + params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) return self.template.format(**params) + class DenseFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(Dense, include_header=dense_include_list) self.template = dense_function_template - + def format(self, 
node): params = self._default_function_params(node) params['w'] = node.get_weights('weight').name @@ -73,23 +76,27 @@ def format(self, node): batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + class BatchNormalizationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(BatchNormalization) self.template = batchnorm_config_template - + def format(self, node): params = self._default_config_params(node) params['n_in'] = node.get_input_variable().size_cpp() - params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) return self.template.format(**params) + class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name @@ -108,6 +115,16 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; static const unsigned table_size = {table_size}; @@ -124,6 +141,7 @@ def format(self, node): activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) @@ -135,16 +153,30 @@ def format(self, node): return self.template.format(**params) + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class SoftmaxConfigTemplate(ActivationConfigTemplate): def __init__(self): - super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ self.template = softmax_config_template + class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__((Activation, Softmax), include_header=activ_include_list) + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) self.template = activ_function_template - + def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() @@ -152,6 +184,7 @@ def format(self, node): return self.template.format(**params) + class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(ParametrizedActivation, include_header=activ_include_list) @@ -165,6 +198,7 @@ def 
format(self, node): return self.template.format(**params) + class PReLUFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(PReLU, include_header=activ_include_list) diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 4411ae4c53..97bdefabd7 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -105,6 +105,8 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': layer['class_name'] = 'Softmax' + if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': + layer['class_name'] = 'HardActivation' if layer['class_name'] == 'Softmax': layer['axis'] = keras_layer['config'].get('axis', -1) diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py index 5839ca542a..b547c39685 100644 --- a/hls4ml/converters/keras/qkeras_layers.py +++ b/hls4ml/converters/keras/qkeras_layers.py @@ -4,6 +4,7 @@ from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer from hls4ml.converters.keras.qkeras import get_quantizer_from_config from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.model.types import FixedPrecisionType @keras_handler('QDense') @@ -46,6 +47,7 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) 'quantized_tanh', 'binary_tanh', 'ternary_tanh', + 'quantized_sigmoid', 'quantized_bits', 'binary', 'ternary', @@ -79,16 +81,32 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) if activation_config['class_name'] not in supported_activations: raise Exception('Unsupported QKeras activation: {}'.format(activation_config['class_name'])) + if activation_config['class_name'] == 'quantized_bits': + activation_config['class_name'] = 'linear' + if activation_config['class_name'] == 'ternary_tanh': layer['class_name'] = 'TernaryTanh' layer['threshold'] = activation_config.get('config', {}).get('threshold', 0.33) if layer['threshold'] is None: layer['threshold'] = 0.33 # the default ternary tanh threshold for QKeras + layer['activation'] = 'ternary_tanh' + elif ( + activation_config['class_name'] == 'quantized_sigmoid' + and not activation_config['config'].get('use_real_sigmoid', False) + ) or ( + activation_config['class_name'] == 'quantized_tanh' and not activation_config['config'].get('use_real_tanh', False) + ): + layer['class_name'] = 'HardActivation' + layer['slope'] = 0.5 # the default values in QKeras + layer['shift'] = 0.5 + # Quartus seems to have trouble if the width is 1. 
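+        # With the QKeras defaults adopted here, hard_sigmoid(x) = clip(0.5 * x + 0.5, 0, 1)
+        # and hard_tanh(x) = 2 * hard_sigmoid(x) - 1. Both constants are exactly representable
+        # with two fractional bits, hence the width=2, integer=0 precision below rather than
+        # the minimal width of 1.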
+ layer['slope_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) + layer['shift_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) + layer['activation'] = activation_config['class_name'].replace('quantized_', 'hard_') else: layer['class_name'] = 'Activation' - if activation_config['class_name'] == 'quantized_bits': - activation_config['class_name'] = 'linear' - layer['activation'] = activation_config['class_name'].replace('quantized_', '') + layer['activation'] = activation_config['class_name'].replace('quantized_', '') + layer['activation_quantizer'] = activation_config return layer, [shape for shape in input_shapes[0]] diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 2c875e42fb..43748246db 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -257,7 +257,16 @@ def parse_keras_model(model_arch, reader): # Define layers to skip for conversion to HLS skip_layers = ['Dropout'] # Activation layers - activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'TernaryTanh'] + activation_layers = [ + 'Activation', + 'LeakyReLU', + 'ThresholdedReLU', + 'ELU', + 'PReLU', + 'Softmax', + 'TernaryTanh', + 'HardActivation', + ] # Recurrent layers recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] # All supported layers diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index b8a3a1a4d9..77c874f589 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -768,6 +768,32 @@ def _get_act_function_name(self): return act # ELU activation +class HardActivation(Activation): + ''' + Implements the hard sigmoid and tan function in keras and qkeras + (Default parameters in qkeras are different, so should be configured) + The hard sigmoid unction is clip(slope * x + shift, 0, 1), and the + hard tanh function is 2 * hard_sigmoid - 1 + ''' + + _expected_attributes = [ + Attribute('slope', value_type=float, default=0.2, configurable=False), + Attribute('shift', value_type=float, default=0.5, configurable=False), + TypeAttribute('slope_t'), + TypeAttribute('shift_t'), + ] + + def initialize(self): + super().initialize() + slope_prec = self.get_attr('slope_prec', FixedPrecisionType(width=16, integer=0, signed=False)) + shift_prec = self.get_attr('shift_prec', FixedPrecisionType(width=1, integer=0, signed=False)) + index = self.get_attr('index') + slope_t = NamedType(f'slope{index}_t', precision=slope_prec) + shift_t = NamedType(f'shift{index}_t', precision=shift_prec) + self.set_attr('slope_t', slope_t) + self.set_attr('shift_t', shift_t) + + class PReLU(Activation): def initialize(self): super().initialize() @@ -1264,6 +1290,7 @@ def _initialize_transforms(self): 'PReLU': PReLU, 'Softmax': Softmax, 'TernaryTanh': TernaryTanh, + 'HardActivation': HardActivation, 'Reshape': Reshape, 'Dense': Dense, 'BinaryDense': Dense, diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index a75262d1fc..b73558f6ee 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,7 +1,5 @@ -from __future__ import absolute_import - -from hls4ml.report.vivado_report import read_vivado_report -from hls4ml.report.vivado_report import parse_vivado_report - -from hls4ml.report.quartus_report import read_quartus_report -from hls4ml.report.quartus_report import parse_quartus_report \ No newline at end of file +from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 +from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 +from 
hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 +from hls4ml.report.vivado_report import print_vivado_report # noqa: F401 +from hls4ml.report.vivado_report import read_vivado_report # noqa: F401 diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 736fc3354b..68c3ad9ddf 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -1,12 +1,12 @@ -from __future__ import print_function import os import re import sys import xml.etree.ElementTree as ET + def read_vivado_report(hls_dir, full_report=False): if not os.path.exists(hls_dir): - print('Path {} does not exist. Exiting.'.format(hls_dir)) + print(f'Path {hls_dir} does not exist. Exiting.') return prj_dir = None @@ -21,30 +21,37 @@ def read_vivado_report(hls_dir, full_report=False): sln_dir = hls_dir + '/' + prj_dir if not os.path.exists(sln_dir): - print('Project {} does not exist. Rerun "hls4ml build -p {}".'.format(prj_dir, hls_dir)) + print(f'Project {prj_dir} does not exist. Rerun "hls4ml build -p {hls_dir}".') return solutions = _find_solutions(sln_dir) - print('Found {} solution(s) in {}.'.format(len(solutions), sln_dir)) + print(f'Found {len(solutions)} solution(s) in {sln_dir}.') for sln in solutions: - print('Reports for solution "{}":\n'.format(sln)) + print(f'Reports for solution "{sln}":\n') _find_reports(sln_dir + '/' + sln, top_func_name, full_report) + def _parse_project_script(path): prj_dir = None top_func_name = None project_path = path + '/project.tcl' - with open(project_path, 'r') as f: + with open(project_path) as f: for line in f.readlines(): if 'set project_name' in line: top_func_name = line.split('"')[-2] prj_dir = top_func_name + '_prj' + if 'set backend' in line: + backend_name = line.split('"')[-2] + + if 'accelerator' in backend_name: + top_func_name += '_axi' return prj_dir, top_func_name + def _find_solutions(sln_dir): solutions = [] @@ -67,49 +74,55 @@ def _find_solutions(sln_dir): return solutions + def _find_reports(sln_dir, top_func_name, full_report=False): - csim_file = sln_dir + '/csim/report/{}_csim.log'.format(top_func_name) + csim_file = sln_dir + f'/csim/report/{top_func_name}_csim.log' if os.path.isfile(csim_file): _show_csim_report(csim_file) else: print('C simulation report not found.') - syn_file = sln_dir + '/syn/report/{}_csynth.rpt'.format(top_func_name) + syn_file = sln_dir + f'/syn/report/{top_func_name}_csynth.rpt' if os.path.isfile(syn_file): _show_synth_report(syn_file, full_report) else: print('Synthesis report not found.') - cosim_file = sln_dir + '/sim/report/{}_cosim.rpt'.format(top_func_name) + cosim_file = sln_dir + f'/sim/report/{top_func_name}_cosim.rpt' if os.path.isfile(cosim_file): _show_cosim_report(cosim_file) else: print('Co-simulation report not found.') + def _show_csim_report(csim_file): - with open(csim_file, 'r') as f: + with open(csim_file) as f: print('C SIMULATION RESULT:') print(f.read()) + def _show_synth_report(synth_file, full_report=False): - with open(synth_file, 'r') as f: + with open(synth_file) as f: print('SYNTHESIS REPORT:') for line in f.readlines()[2:]: if not full_report and '* DSP48' in line: break - print(line, end = '') + print(line, end='') + def _show_cosim_report(cosim_file): - with open(cosim_file, 'r') as f: + with open(cosim_file) as f: print('CO-SIMULATION RESULT:') print(f.read()) + def _get_abs_and_percentage_values(unparsed_cell): return int(unparsed_cell.split('(')[0]), float(unparsed_cell.split('(')[1].replace('%', '').replace(')', '')) + def 
parse_vivado_report(hls_dir): if not os.path.exists(hls_dir): - print('Path {} does not exist. Exiting.'.format(hls_dir)) + print(f'Path {hls_dir} does not exist. Exiting.') return prj_dir = None @@ -124,19 +137,19 @@ def parse_vivado_report(hls_dir): sln_dir = hls_dir + '/' + prj_dir if not os.path.exists(sln_dir): - print('Project {} does not exist. Rerun "hls4ml build -p {}".'.format(prj_dir, hls_dir)) + print(f'Project {prj_dir} does not exist. Rerun "hls4ml build -p {hls_dir}".') return solutions = _find_solutions(sln_dir) if len(solutions) > 1: - print('WARNING: Found {} solution(s) in {}. Using the first solution.'.format(len(solutions), sln_dir)) + print(f'WARNING: Found {len(solutions)} solution(s) in {sln_dir}. Using the first solution.') report = {} sim_file = hls_dir + '/tb_data/csim_results.log' if os.path.isfile(sim_file): csim_results = [] - with open(sim_file, 'r') as f: + with open(sim_file) as f: for line in f.readlines(): csim_results.append([r for r in line.split()]) report['CSimResults'] = csim_results @@ -144,18 +157,19 @@ def parse_vivado_report(hls_dir): sim_file = hls_dir + '/tb_data/rtl_cosim_results.log' if os.path.isfile(sim_file): cosim_results = [] - with open(sim_file, 'r') as f: + with open(sim_file) as f: for line in f.readlines(): cosim_results.append([r for r in line.split()]) report['CosimResults'] = cosim_results - syn_file = sln_dir + '/' + solutions[0] + '/syn/report/{}_csynth.xml'.format(top_func_name) + syn_file = sln_dir + '/' + solutions[0] + f'/syn/report/{top_func_name}_csynth.xml' c_synth_report = {} if os.path.isfile(syn_file): root = ET.parse(syn_file).getroot() # Performance perf_node = root.find('./PerformanceEstimates') + c_synth_report['TargetClockPeriod'] = root.find('./UserAssignments/TargetClockPeriod').text c_synth_report['EstimatedClockPeriod'] = perf_node.find('./SummaryOfTimingAnalysis/EstimatedClockPeriod').text c_synth_report['BestLatency'] = perf_node.find('./SummaryOfOverallLatency/Best-caseLatency').text c_synth_report['WorstLatency'] = perf_node.find('./SummaryOfOverallLatency/Worst-caseLatency').text @@ -187,9 +201,10 @@ def parse_vivado_report(hls_dir): section = int(match.group(1)) # Sometimes, phrases such as 'CLB Registers' can show up in the non-tabular sections of the report if '|' in line: - if 'CLB LUTs' in line and section == 1: + # CLB (2019.X) vs. 
Slice (2020.X) + if ('CLB LUTs' in line or 'Slice LUTs' in line) and section == 1: vivado_synth_rpt['LUT'] = line.split('|')[2].strip() - elif 'CLB Registers' in line and section == 1: + elif ('CLB Registers' in line or 'Slice Registers' in line) and section == 1: vivado_synth_rpt['FF'] = line.split('|')[2].strip() elif 'Block RAM Tile' in line and section == 2: vivado_synth_rpt['BRAM_18K'] = line.split('|')[2].strip() @@ -201,13 +216,13 @@ def parse_vivado_report(hls_dir): else: print('Vivado synthesis report not found.') - cosim_file = sln_dir + '/' + solutions[0] + '/sim/report/{}_cosim.rpt'.format(top_func_name) + cosim_file = sln_dir + '/' + solutions[0] + f'/sim/report/{top_func_name}_cosim.rpt' if os.path.isfile(cosim_file): cosim_report = {} - with open(cosim_file, 'r') as f: + with open(cosim_file) as f: for line in f.readlines(): if re.search('VHDL', line) or re.search('Verilog', line): - result = line[1:].split() # [1:] skips the leading '|' + result = line[1:].split() # [1:] skips the leading '|' result = [res[:-1] if res[-1] == '|' else res for res in result] # RTL, Status, Latency-min, Latency-avg, Latency-max, Interval-min, Interval-avg, Interval-max if result[1] == 'NA': @@ -224,27 +239,58 @@ def parse_vivado_report(hls_dir): print('Cosim report not found.') if os.path.isfile(cosim_file): - transaction_file = sln_dir + '/' + solutions[0] + '/sim/' + report['CosimReport']['RTL'].lower() + '/' + top_func_name + '.performance.result.transaction.xml' + transaction_file = ( + sln_dir + + '/' + + solutions[0] + + '/sim/' + + report['CosimReport']['RTL'].lower() + + '/' + + top_func_name + + '.performance.result.transaction.xml' + ) if os.path.isfile(transaction_file): - cosim_transactions = {'InitiationInterval': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, - 'Latency': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}} - with open(transaction_file, 'r') as f: + cosim_transactions = { + 'InitiationInterval': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, + 'Latency': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, + } + with open(transaction_file) as f: i = 1 for line in f.readlines(): if re.search('transaction', line): result = line.split() # update min if result[3] != 'x': - cosim_transactions['InitiationInterval']['min'] = int(result[3]) if int(result[3]) < cosim_transactions['InitiationInterval']['min'] else cosim_transactions['InitiationInterval']['min'] - cosim_transactions['Latency']['min'] = int(result[2]) if int(result[2]) < cosim_transactions['Latency']['min'] else cosim_transactions['Latency']['min'] + cosim_transactions['InitiationInterval']['min'] = ( + int(result[3]) + if int(result[3]) < cosim_transactions['InitiationInterval']['min'] + else cosim_transactions['InitiationInterval']['min'] + ) + cosim_transactions['Latency']['min'] = ( + int(result[2]) + if int(result[2]) < cosim_transactions['Latency']['min'] + else cosim_transactions['Latency']['min'] + ) # update max if result[3] != 'x': - cosim_transactions['InitiationInterval']['max'] = int(result[3]) if int(result[3]) > cosim_transactions['InitiationInterval']['max'] else cosim_transactions['InitiationInterval']['max'] - cosim_transactions['Latency']['max'] = int(result[2]) if int(result[2]) > cosim_transactions['Latency']['max'] else cosim_transactions['Latency']['max'] + cosim_transactions['InitiationInterval']['max'] = ( + int(result[3]) + if int(result[3]) > cosim_transactions['InitiationInterval']['max'] + else cosim_transactions['InitiationInterval']['max'] + ) + cosim_transactions['Latency']['max'] = ( + 
int(result[2]) + if int(result[2]) > cosim_transactions['Latency']['max'] + else cosim_transactions['Latency']['max'] + ) # update avg if result[3] != 'x': - cosim_transactions['InitiationInterval']['avg'] = cosim_transactions['InitiationInterval']['avg'] + float((int(result[3]) - cosim_transactions['InitiationInterval']['avg']) / i) - cosim_transactions['Latency']['avg'] = cosim_transactions['Latency']['avg'] + float((int(result[2]) - cosim_transactions['Latency']['avg']) / i) + cosim_transactions['InitiationInterval']['avg'] = cosim_transactions['InitiationInterval'][ + 'avg' + ] + float((int(result[3]) - cosim_transactions['InitiationInterval']['avg']) / i) + cosim_transactions['Latency']['avg'] = cosim_transactions['Latency']['avg'] + float( + (int(result[2]) - cosim_transactions['Latency']['avg']) / i + ) i += 1 report['CosimReport']['LatencyMin'] = cosim_transactions['Latency']['min'] @@ -258,10 +304,10 @@ def parse_vivado_report(hls_dir): util_rpt_file = hls_dir + '/util.rpt' if os.path.isfile(util_rpt_file): implementation_report = {} - with open(util_rpt_file, 'r') as f: + with open(util_rpt_file) as f: for line in f.readlines(): - if re.search('\(top\)', line): - # Total LUTs | Logic LUTs | LUTRAMs | SRLs | FFs | RAMB36 | RAMB18 (| URAM )| DSP48 Blocks + if re.search(r'\(top\)', line): + # Total LUTs | Logic LUTs | LUTRAMs | SRLs | FFs | RAMB36 | RAMB18 (| URAM )| DSP48 Blocks # skipping the first 2 unuseful cells with [:2] results = [_get_abs_and_percentage_values(elem) for elem in line.replace('|', '').split()[2:]] implementation_report['TotLUTs'] = results[0][0] @@ -298,20 +344,25 @@ def parse_vivado_report(hls_dir): else: print('Implementation report not found.') - timing_report_file = hls_dir + '/' + prj_dir.split('_')[0] + '_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_timing_summary_routed.rpt' + timing_report_file = ( + hls_dir + + '/' + + prj_dir.split('_')[0] + + '_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_timing_summary_routed.rpt' + ) if os.path.isfile(timing_report_file): timing_report = {} - with open(timing_report_file, 'r') as f: + with open(timing_report_file) as f: while not re.search('WNS', next(f)): pass # skip the successive line next(f) result = next(f).split() - timing_report['WNS'] = float(result[0]) - timing_report['TNS'] = float(result[1]) - timing_report['WHS'] = float(result[4]) - timing_report['THS'] = float(result[5]) + timing_report['WNS'] = float(result[0]) + timing_report['TNS'] = float(result[1]) + timing_report['WHS'] = float(result[4]) + timing_report['THS'] = float(result[5]) timing_report['WPWS'] = float(result[8]) timing_report['TPWS'] = float(result[9]) @@ -320,3 +371,304 @@ def parse_vivado_report(hls_dir): print('Timing report not found.') return report + +def print_vivado_report(report_dict): + if _is_running_in_notebook(): + _print_ipython_report(report_dict) + else: + _print_str_report(report_dict) + + +def _print_ipython_report(report_dict): + from IPython.display import HTML, display + + html = '\n' + _table_css + '
' + body = _make_report_body(report_dict, _make_html_table_template, _make_html_header) + html += body + '\n
\n' + display(HTML(html)) + + +def _print_str_report(report_dict): + body = _make_report_body(report_dict, _make_str_table_template, _make_str_header) + print(body) + + +def _is_running_in_notebook(): + try: + from IPython import get_ipython + + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + return True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + +_table_css = """ + +""" + +_table_base_template = """ + + + + + + + +{table_rows} + +
{table_header}
+""" + +_row_base_template = " {row_title}{{{row_key}}}" + + +def _make_html_table_template(table_header, row_templates): + table_rows = '\n'.join( + [_row_base_template.format(row_title=row_title, row_key=row_key) for row_title, row_key in row_templates.items()] + ) + return _table_base_template.format(table_header=table_header, table_rows=table_rows) + + +def _make_str_table_template(table_header, row_templates): + len_title = 0 + for row_title in row_templates.keys(): + if len(row_title) > len_title: + len_title = len(row_title) + head = f'\n - {table_header}:\n' + table_rows = '\n'.join( + [' ' + f'{row_title}:'.ljust(len_title + 1) + f' {{{row_key}}}' for row_title, row_key in row_templates.items()] + ) + return head + table_rows + '\n' + + +def _make_html_header(report_header): + return f'

{report_header}:

' + + +def _make_str_header(report_header): + sep = '=' * 54 + '\n' + return '\n' + sep + '== ' + report_header + '\n' + sep + + +def _convert_cycles_to_time(n_cycles, clock_period): + time_in_ns = n_cycles * clock_period + if time_in_ns < 1000: + return str(time_in_ns) + ' ns' + + time_in_us = time_in_ns / 1000 + if time_in_us < 1000: + return str(time_in_us) + ' \u00B5s' + + time_in_ms = time_in_us / 1000 + if time_in_ms < 1000: + return str(time_in_ms) + ' ms' + + time_in_s = time_in_ms / 1000 + if time_in_s < 1000: + return str(time_in_s) + ' s' + + +def _make_report_body(report_dict, make_table_template, make_header_template): + body = '' + + if 'CSynthesisReport' in report_dict: + body += make_header_template('C Synthesis report') + perf_rows = { + 'Best-case latency': 'best_latency', + 'Worst-case latency': 'worst_latency', + 'Interval Min': 'interval_min', + 'Interval Max': 'interval_max', + 'Estimated Clock Period': 'estimated_clock', + } + area_rows = { + 'BRAM_18K': 'bram', + 'DSP48E': 'dsp', + 'FF': 'ff', + 'LUT': 'lut', + 'URAM': 'uram', + } + body += make_table_template('Performance estimates', perf_rows) + body += make_table_template('Resource estimates', area_rows) + + csynth_report = report_dict['CSynthesisReport'] + target_clock = float(csynth_report['TargetClockPeriod']) + best_latency = int(csynth_report['BestLatency']) + worst_latency = int(csynth_report['BestLatency']) + bram = int(csynth_report['BRAM_18K']) + avail_bram = int(csynth_report['AvailableBRAM_18K']) + dsp = int(csynth_report['DSP48E']) + avail_dsp = int(csynth_report['AvailableDSP48E']) + ff = int(csynth_report['FF']) + avail_ff = int(csynth_report['AvailableFF']) + lut = int(csynth_report['LUT']) + avail_lut = int(csynth_report['AvailableLUT']) + if 'URAM' in csynth_report: + uram = int(csynth_report['URAM']) + avail_uram = int(csynth_report['AvailableURAM']) + + params = {} + + params['best_latency'] = str(best_latency) + ' (' + _convert_cycles_to_time(best_latency, target_clock) + ')' + params['worst_latency'] = str(worst_latency) + ' (' + _convert_cycles_to_time(worst_latency, target_clock) + ')' + params['interval_min'] = csynth_report['IntervalMin'] + params['interval_max'] = csynth_report['IntervalMax'] + params['estimated_clock'] = csynth_report['EstimatedClockPeriod'] + + params['bram'] = str(bram) + ' / ' + str(avail_bram) + ' (' + str(round(bram / avail_bram * 100, 1)) + '%)' + params['dsp'] = str(dsp) + ' / ' + str(avail_dsp) + ' (' + str(round(dsp / avail_dsp * 100, 1)) + '%)' + params['ff'] = str(ff) + ' / ' + str(avail_ff) + ' (' + str(round(ff / avail_ff * 100, 1)) + '%)' + params['lut'] = str(lut) + ' / ' + str(avail_lut) + ' (' + str(round(lut / avail_lut * 100, 1)) + '%)' + if 'URAM' in csynth_report and avail_uram > 0: + params['uram'] = str(uram) + ' / ' + str(avail_uram) + ' (' + str(round(uram / avail_uram * 100, 1)) + '%)' + else: + params['uram'] = 'N/A' + + body = body.format(**params) + + if 'VivadoSynthReport' in report_dict: + body += make_header_template('Vivado Synthesis report') + area_rows = { + 'BRAM_18K': 'bram', + 'DSP48E': 'dsp', + 'FF': 'ff', + 'LUT': 'lut', + 'URAM': 'uram', + } + body += make_table_template('Resource utilization', area_rows) + + vsynth_report = report_dict['VivadoSynthReport'] + + params = {} + params['bram'] = vsynth_report['BRAM_18K'] + params['dsp'] = vsynth_report['DSP48E'] + params['ff'] = vsynth_report['FF'] + params['lut'] = vsynth_report['LUT'] + params['uram'] = vsynth_report['URAM'] if 'URAM' in vsynth_report else 'N/A' + + body = 
body.format(**params) + + if 'CosimReport' in report_dict: + body += make_header_template('Co-Simulation report') + perf_rows = { + 'Status': 'status', + 'Best-case latency': 'best_latency', + 'Worst-case latency': 'worst_latency', + 'Interval Min': 'interval_min', + 'Interval Max': 'interval_max', + } + body += make_table_template('Performance', perf_rows) + + cosim_report = report_dict['CosimReport'] + + params = {} + params['status'] = cosim_report['Status'] + params['best_latency'] = cosim_report['LatencyMin'] + params['worst_latency'] = cosim_report['LatencyMax'] + params['interval_min'] = cosim_report['IntervalMin'] + params['interval_max'] = cosim_report['IntervalMax'] + + body = body.format(**params) + + if 'ImplementationReport' in report_dict: + body += make_header_template('Implementation report') + area_rows = { + 'Total LUTs': 'lut', + 'Logic LUTs': 'logiclut', + 'LUTRAM': 'lutram', + 'SRLs': 'srl', + 'FF': 'ff', + 'RAMB18': 'bram18', + 'RAMB36': 'bram36', + 'DSP': 'dsp', + 'URAM': 'uram', + } + body += make_table_template('Resource utilization', area_rows) + + impl_report = report_dict['ImplementationReport'] + + params = {} + params['lut'] = impl_report['TotLUTs'] + ' (' + impl_report['TotLUTs%'] + '%)' + params['logiclut'] = impl_report['LogicLUTs'] + ' (' + impl_report['LogicLUTs%'] + '%)' + params['lutram'] = impl_report['LUTRAMs'] + ' (' + impl_report['LUTRAMs%'] + '%)' + params['srl'] = impl_report['SRLs'] + ' (' + impl_report['SRLs%'] + '%)' + params['ff'] = impl_report['FFs'] + ' (' + impl_report['FFs%'] + '%)' + params['bram18'] = impl_report['RAMB18s'] + ' (' + impl_report['RAMB18s%'] + '%)' + params['bram36'] = impl_report['RAMB36s'] + ' (' + impl_report['RAMB36s%'] + '%)' + params['dsp'] = impl_report['DSPs'] + ' (' + impl_report['DSPs%'] + '%)' + if 'URAMs' in impl_report: + params['uram'] = impl_report['URAMs'] + ' (' + impl_report['URAMs%'] + '%)' + else: + params['uram'] = 'N/A' + + body = body.format(**params) + + if 'TimingReport' in report_dict: + body += make_header_template('Timing report') + perf_rows = { + 'Worst Negative Slack (WNS)': 'wns', + 'Total Negative Slack (TNS)': 'tns', + 'Worst Hold Slack (WHS)': 'whs', + 'Total Hold Slack (THS)': 'ths', + 'Worst Pulse Width Slack (WPWS)': 'wpws', + 'Total Pulse Width Slack (TPWS)': 'tpws', + } + body += make_table_template('Timing', perf_rows) + + timing_report = report_dict['TimingReport'] + + params = {} + params['wns'] = round(timing_report['WNS'], 2) + params['tns'] = round(timing_report['TNS'], 2) + params['whs'] = round(timing_report['WHS'], 2) + params['ths'] = round(timing_report['THS'], 2) + params['wpws'] = round(timing_report['WPWS'], 2) + params['tpws'] = round(timing_report['TPWS'], 2) + + body = body.format(**params) + + return body diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index 6e9b243d83..c3fe4ec402 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -2,54 +2,46 @@ #define DEFINES_H_ /* -* Intel HLS makes use of three streaming interfaces: -* (1) stream_in - used as the main input to a component -* (2) stream_out - used as the main output of a component -* (3) stream - allows both reading and writing; used for inter-component connections -* ihc::stream has a implicitly deleted constructor and therefore, cannot be used as the output of a function/component -* Therefore, variables of type 'stream' are always passed by reference -*/ + * Intel HLS makes use of three streaming 
interfaces: + * (1) stream_in - used as the main input to a component + * (2) stream_out - used as the main output of a component + * (3) stream - allows both reading and writing; used for inter-component connections + * ihc::stream has a implicitly deleted constructor and therefore, cannot be used as the output of a function/component + * Therefore, variables of type 'stream' are always passed by reference + */ #ifndef __INTELFPGA_COMPILER__ -#include "ac_int.h" #include "ac_fixed.h" +#include "ac_int.h" #define hls_register #include "stream.h" -template -using stream = nnet::stream; -template -using stream_in = nnet::stream; -template -using stream_out = nnet::stream; +template using stream = nnet::stream; +template using stream_in = nnet::stream; +template using stream_out = nnet::stream; #else -#include "HLS/hls.h" -#include "HLS/ac_int.h" #include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/hls.h" -template -using stream = ihc::stream; -template -using stream_in = ihc::stream_in; -template -using stream_out = ihc::stream_out; +template using stream = ihc::stream; +template using stream_in = ihc::stream_in; +template using stream_out = ihc::stream_out; #endif // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -//hls-fpga-machine-learning insert numbers - - -//hls-fpga-machine-learning insert layer-precision +// hls-fpga-machine-learning insert numbers +// hls-fpga-machine-learning insert layer-precision -#define DIV_ROUNDUP(n,d) ((n + d - 1) / d) -#define MIN(n,d) (n > d ? d : n) -#define MAX(n,d) (n < d ? d : n) +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? d : n) #endif diff --git a/hls4ml/templates/quartus/firmware/myproject.cpp b/hls4ml/templates/quartus/firmware/myproject.cpp index 9ca07fce64..8bde3194a6 100644 --- a/hls4ml/templates/quartus/firmware/myproject.cpp +++ b/hls4ml/templates/quartus/firmware/myproject.cpp @@ -18,16 +18,17 @@ // #include "myproject.h" +#include "parameters.h" -//hls-fpga-machine-learning insert weights +// hls-fpga-machine-learning insert weights /* -* Intel HLS requires that all 'stream' types are: -* (1) Passed by reference to the top-level entity or -* (2) Declared as global variables, outside of the main function -* Therefore, layer inputs/output (connections betweenn individual layers) are declared here -*/ -//hls-fpga-machine-learning insert inter-task streams + * Intel HLS requires that all 'stream' types are: + * (1) Passed by reference to the top-level entity or + * (2) Declared as global variables, outside of the main function + * Therefore, layer inputs/output (connections betweenn individual layers) are declared here + */ +// hls-fpga-machine-learning insert inter-task streams #ifndef __INTELFPGA_COMPILER__ /* @@ -42,25 +43,25 @@ - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) * This distinction is handled in quartus_writer.py */ -//hls-fpga-machine-learning instantiate GCC top-level +// hls-fpga-machine-learning instantiate GCC top-level #else // Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -//hls-fpga-machine-learning insert cpragmas +// hls-fpga-machine-learning insert cpragmas /* -* The top-level function used during HLS Synthesis goes here -* In a similar manner to GCC, there is a distinction between io_stream & io_parallel -*/ -//hls-fpga-machine-learning instantiate HLS top-level -#endif - // If 
using io_parallel, the output needs to be initialised and returned at the end of this function - // If using io_stream, no output is initialised, as it is passed by reference to the top-level function - //hls-fpga-machine-learning initialize input/output + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level +#endif +// If using io_parallel, the output needs to be initialised and returned at the end of this function +// If using io_stream, no output is initialised, as it is passed by reference to the top-level function +// hls-fpga-machine-learning initialize input/output + +// **************************************** +// NETWORK INSTANTIATION +// **************************************** - // **************************************** - // NETWORK INSTANTIATION - // **************************************** +// hls-fpga-machine-learning insert layers - //hls-fpga-machine-learning insert layers - - //hls-fpga-machine-learning return +// hls-fpga-machine-learning return diff --git a/hls4ml/templates/quartus/firmware/myproject.h b/hls4ml/templates/quartus/firmware/myproject.h index 1e11ec43d3..1ac03b7246 100644 --- a/hls4ml/templates/quartus/firmware/myproject.h +++ b/hls4ml/templates/quartus/firmware/myproject.h @@ -21,23 +21,23 @@ #define MYPROJECT_H_ #ifndef __INTELFPGA_COMPILER__ -#include "ac_int.h" #include "ac_fixed.h" +#include "ac_int.h" #define hls_register #else -#include "HLS/hls.h" -#include "HLS/ac_int.h" #include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/hls.h" #endif // Streams are explicitly defined in defines.h, which are included for parameters.h // Defining them again in this file will cause compile-time errors -#include "parameters.h" +#include "defines.h" // If using io_parallel, inputs and output need to be initialised before calling the top-level function // If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function -//hls-fpga-machine-learning insert inputs -//hls-fpga-machine-learning insert outputs +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs #ifndef __INTELFPGA_COMPILER__ /* @@ -52,16 +52,16 @@ - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) * This distinction is handled in quartus_writer.py */ -//hls-fpga-machine-learning instantiate GCC top-level +// hls-fpga-machine-learning instantiate GCC top-level #else // Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -//hls-fpga-machine-learning insert cpragmas +// hls-fpga-machine-learning insert cpragmas /* -* The top-level function used during HLS Synthesis goes here -* In a similar manner to GCC, there is a distinction between io_stream & io_parallel -*/ -//hls-fpga-machine-learning instantiate HLS top-level + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level #endif #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h old mode 100755 new mode 100644 index 20790a390a..b750a688e9 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ 
b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -24,8 +24,7 @@ namespace nnet { -struct activ_config -{ +struct activ_config { // IO size static const unsigned n_in = 10; @@ -37,17 +36,15 @@ struct activ_config static const unsigned reuse_factor = 1; // Internal data type definitions - typedef ac_fixed<16,8> table_t; + typedef ac_fixed<16, 8> table_t; }; // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii -void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = 0; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; } } -template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii MAX_INT) res[ii] = MAX_INT; - else res[ii] = datareg; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; } } -template -void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } -template -void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } - // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - static const int MAX_VALUE=8; - #include "activation_tables/sigmoid_table.tb" +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" #pragma unroll - for (int ii=0; ii < CONFIG_T::n_in; ii++) { - data_T absoluteValue hls_register; - res_T temp2 hls_register; - if(data[ii] < 0 ){ - absoluteValue = - data[ii]; - } - else{ + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T absoluteValue hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { absoluteValue = data[ii]; } - int index = ( absoluteValue *( CONFIG_T::table_size / MAX_VALUE)).to_int(); - if (absoluteValue > MAX_VALUE ) index = CONFIG_T::table_size - 1; - temp2 = (res_T) sigmoid_table[index]; - if(data[ii] < 0 ){ - res[ii] = 1-temp2; - } - else{ + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)sigmoid_table[index]; + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { res[ii] = temp2; } } @@ -126,35 +118,34 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Softmax Activation // ************************************************* -enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3}; +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; -template -inline unsigned softmax_stable_idx_from_real_val(const data_T x){ +template inline unsigned 
softmax_stable_idx_from_real_val(const data_T x) { // Number of address bits for table static constexpr int N = ceillog2(CONFIG_T::table_size); // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width-N-1); + hls_register ac_int y = x.template slc(x.width - N - 1); // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness - if (x != 0 && y == 0) y[0] = 1; + if (x != 0 && y == 0) + y[0] = 1; return y.to_uint(); } -template -inline unsigned softmax_latency_idx_from_real_val(const data_T x){ +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { // Number of address bits for table - static constexpr int N = ceillog2(CONFIG_T::table_size); + static constexpr int N = ceillog2(CONFIG_T::table_size); // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width-N); + hls_register ac_int y = x.template slc(x.width - N); return y.to_uint(); } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - // Look-up tables - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" // Find maximum Op_max op_max; @@ -163,102 +154,109 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // For the diffs, use the same type as the input but force rounding and saturation hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; } // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; } // Explicitly sum previously calculated exponentials with an adder tree Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { res[i] = exp_res[i] * inv_exp_sum; } } // TODO - Improve accuracy template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - #include "activation_tables/exp_table_latency.tb" - #include "activation_tables/invert_table_latency.tb" - +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; } // Explicitly sum the results with an adder tree. 
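+    // reduce<> combines the partial sums as a balanced adder tree, so the CONFIG_T::n_in
+    // exponentials close in a logarithmic number of adder stages rather than a sequential chain.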
Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { res[i] = exp_res[i] * inv_exp_sum; } } -template +template void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #include "activation_tables/exp_table_legacy.tb" - #include "activation_tables/invert_table_legacy.tb" +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" hls_register int data_round[CONFIG_T::n_in]; - New_loop: +New_loop: #pragma unroll - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; exp_res_temp += temp_exp; } } - int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; + int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; res[ii] = invert_table_legacy[exp_res_index]; } } -template +template void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { - res[i] = (res_T) 0; + res[i] = (res_T)0; } hls_register data_T maximum = data[0]; - hls_register int idx = 0; + hls_register int idx = 0; #pragma ii 1 for (int i = 1; i < CONFIG_T::n_in; i++) { @@ -268,56 +266,55 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } - res[idx] = (res_T) 1; + res[idx] = (res_T)1; } -template -inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - switch(CONFIG_T::implementation) { - case softmax_implementation::stable: - softmax_stable(data, res); - break; - case softmax_implementation::latency: - softmax_latency(data, res); - break; - case softmax_implementation::legacy: - softmax_legacy(data, res); - break; - default: - softmax_stable(data, res); - break; - case softmax_implementation::argmax: - softmax_argmax(data, res); - break; +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + switch (CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; } } // ************************************************* // TanH Activation // ************************************************* -template +template void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - static const int MAX_VALUE=4; - // Initialize the lookup 
table - #include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; +// Initialize the lookup table +#include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii index = ( temp *(CONFIG_T::table_size/MAX_VALUE)).to_int(); - if (temp > MAX_VALUE ) index = CONFIG_T::table_size-1; - temp2 = (res_T) tanh_table[index]; - if(data[ii] < 0 ){ + ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)tanh_table[index]; + if (data[ii] < 0) { res[ii] = -temp2; - } - else{ + } else { res[ii] = temp2; } } @@ -326,95 +323,108 @@ void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // ************************************************* // Hard sigmoid Activation // ************************************************* -template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - data_T slope = (data_T) 0.2; - data_T shift = (data_T) 0.5; +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 1) datareg = 1; - else if (datareg < 0) datareg = 0; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; res[ii] = datareg; } } +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; } } // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) -{ +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii theta) res[ii] = datareg; - else res[ii] = 0; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; } } // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/softplus_table.tb" +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii data_round = (data[ii]*CONFIG_T::table_size/16).to_int(); - ac_int<16> index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softplus_table[index]; 
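+    // The softplus table spans inputs in [-8, 8): scaling by table_size/16 gives the table step,
+    // adding 8 * table_size / 16 (i.e. table_size/2) centres x = 0, and out-of-range indices are
+    // clamped to the first/last entry.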
+ for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; } } // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - static const int MAX_VALUE=8; - // Initialize the lookup table - #include "activation_tables/softsign_table.tb" +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii index = (temp*CONFIG_T::table_size/MAX_VALUE).to_int(); - if (temp > MAX_VALUE) index = CONFIG_T::table_size-1; - temp2 = (res_T) softsign_table[index]; - if(data[ii] < 0 ){ + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)softsign_table[index]; + if (data[ii] < 0) { res[ii] = -temp2; - } - else{ + } else { res[ii] = temp2; } } @@ -423,48 +433,45 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // ELU Activation // ************************************************* -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/elu_table.tb" +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/elu_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii= 0) { res[ii] = datareg; } else { - ac_int<16> index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = alpha * elu_table[index]; } } } -template -void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - elu(data, 1.0, res); +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template -void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/selu_table.tb" +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii= 0) { res[ii] = res_T(1.0507009873554804934193349852946) * datareg; } else { - ac_int<16> index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = selu_table[index]; } } @@ -473,52 +480,56 @@ void selu(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha[ii] * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; } } // ************************************************* // Binary TanH Activation // ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0 ) cache = 1; - else cache = -1; + if (datareg > 0) + cache = 1; + else + cache = -1; - res[ii] = (res_T) cache; + res[ii] = (res_T)cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - #pragma unroll - for (int ii=0; ii 1 ) cache = 1; - else if( datareg > -1 && datareg <= 1) cache=0; - else cache = -1; - - res[ii] = (res_T) cache; - } +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = 2 * data[ii]; + res_T cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } } -} +} // namespace nnet #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h index 3dae6d8d59..f0562a9b22 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h @@ -4,20 +4,19 @@ #include "nnet_common.h" #include "nnet_types.h" -namespace nnet{ +namespace nnet { // ************************************************* // Linear Activation // ************************************************* -template -void linear(stream &data, stream &res) { - LinearActLoop: +template void linear(stream &data, stream &res) { +LinearActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - LinearPackLoop: + LinearPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { out_data[j] = in_data[j]; @@ -30,19 +29,20 @@ void linear(stream &data, stream &res) { // ************************************************* // ReLU Activation // ************************************************* -template -void relu(stream &data, stream &res) { - ReLUActLoop: +template void relu(stream &data, stream &res) { +ReLUActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - ReLUPackLoop: + ReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -52,22 +52,24 @@ void relu(stream &data, stream &res) { // ************************************************* // Leaky RELU 
Activation // ************************************************* -template +template void leaky_relu(stream &data, const typename data_T::value_type alpha, stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - - LeakyReLUActLoop: + +LeakyReLUActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - LeakyReLUPackLoop: + LeakyReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; } res.write(out_data); @@ -77,19 +79,21 @@ void leaky_relu(stream &data, const typename data_T::value_type alpha, s // ************************************************* // Thresholded RELU Activation // ************************************************* -template +template void thresholded_relu(stream &data, const typename data_T::value_type theta, stream &res) { - ThresholdedReLUActLoop: +ThresholdedReLUActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - ThresholdedReLUPackLoop: + ThresholdedReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > theta) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -99,28 +103,29 @@ void thresholded_relu(stream &data, const typename data_T::value_type th // ************************************************* // ELU Activation // ************************************************* -template +template void elu(stream &data, const typename data_T::value_type alpha, stream &res) { - #include "activation_tables/elu_table.tb" +#include "activation_tables/elu_table.tb" constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - EluActLoop: +EluActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - EluPackLoop: + EluPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { - int index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = alpha * elu_table[index]; } } @@ -129,33 +134,32 @@ void elu(stream &data, const typename data_T::value_type alpha, stream -void elu(stream &data, stream &res) { +template void elu(stream &data, stream &res) { elu(data, 1.0, res); } // ************************************************* // SeLU Activation // ************************************************* -template -void selu(stream &data, stream &res) { - #include "activation_tables/selu_table.tb" +template void selu(stream &data, stream &res) { +#include "activation_tables/selu_table.tb" - SeluActLoop: +SeluActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SeluPackLoop: + SeluPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { 
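+            // Positive inputs are scaled by the SELU constant (~1.0507); negative inputs are
+            // served from the precomputed selu_table lookup instead of evaluating the exponential.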
hls_register typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = typename data_T::value_type (1.0507009873554804934193349852946) * datareg; + out_data[j] = typename data_T::value_type(1.0507009873554804934193349852946) * datareg; } else { - int index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = selu_table[index]; } } @@ -167,22 +171,24 @@ void selu(stream &data, stream &res) { // ************************************************* // PReLU Activation // ************************************************* -template +template void prelu(stream &data, const typename data_T::value_type alpha[CONFIG_T::n_in], stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - - PReLUActLoop: + +PReLUActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - PReLUPackLoop: + PReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha[i*res_T::size+j] * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; } res.write(out_data); @@ -192,23 +198,24 @@ void prelu(stream &data, const typename data_T::value_type alpha[CONFIG_ // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(stream &data, stream &res) { - #include "activation_tables/softplus_table.tb" +template void softplus(stream &data, stream &res) { +#include "activation_tables/softplus_table.tb" - SoftplusActLoop: +SoftplusActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SoftplusPackLoop: + SoftplusPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register int data_round = (in_data[j]*CONFIG_T::table_size/16).to_int(); - hls_register int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + hls_register int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + hls_register int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softplus_table[index]; } @@ -219,35 +226,34 @@ void softplus(stream &data, stream &res) { // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(stream &data, stream &res) { - #include "activation_tables/softsign_table.tb" +template void softsign(stream &data, stream &res) { +#include "activation_tables/softsign_table.tb" static const int MAX_VALUE = 8; - SoftsignActLoop: +SoftsignActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SoftsignPackLoop: + SoftsignPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register typename data_T::value_type absValue;; - if(in_data[j] < 0){ + hls_register typename data_T::value_type 
absValue; + ; + if (in_data[j] < 0) { absValue = -in_data[j]; - } - else{ + } else { absValue = in_data[j]; } ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); - if (absValue > MAX_VALUE) index = CONFIG_T::table_size - 1; - if(in_data[j] < 0) { - out_data[j] = -(typename res_T::value_type) softsign_table[index]; - } - else { - out_data[j] = (typename res_T::value_type) softsign_table[index]; + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = -(typename res_T::value_type)softsign_table[index]; + } else { + out_data[j] = (typename res_T::value_type)softsign_table[index]; } } @@ -259,101 +265,106 @@ void softsign(stream &data, stream &res) { // Softmax Activation // ************************************************* -template -void softmax_stable(stream &data, stream &res) { - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" +template void softmax_stable(stream &data, stream &res) { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; hls_register typename data_T::value_type data_array[data_T::size]; - - SoftmaxArrayLoop: + +SoftmaxArrayLoop: #pragma ii pipeline - for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { data_T in_pack = data.read(); - - SoftmaxArrayPackLoop: - #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_T::size; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; - hls_register typename data_T::value_type x_max = reduce>(data_array, op_max); + hls_register typename data_T::value_type x_max = + reduce>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - hls_register ac_fixed d_xi_xmax[data_T::size]; + hls_register ac_fixed + d_xi_xmax[data_T::size]; #pragma unroll - for(unsigned j = 0; j < data_T::size; j++){ + for (unsigned j = 0; j < data_T::size; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + for (unsigned j = 0; j < data_T::size; j++) { exp_res[j] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. 
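The streaming softmax_stable above follows the usual numerically stable recipe: find the maximum, exponentiate the (non-positive) differences through a lookup table, then multiply by the reciprocal of the sum. A minimal plain-C++ reference of that recipe, using float and std::exp instead of the fixed-point types and tables (all names here are illustrative, not from the patch):

// Reference sketch of the stable-softmax recipe used by softmax_stable above.
// Plain float math; the HLS version replaces exp() and 1/x with table lookups.
#include <algorithm>
#include <cmath>
#include <cstdio>

void softmax_stable_ref(const float *x, float *y, int n) {
    float x_max = *std::max_element(x, x + n); // find the maximum (the Op_max reduce above)
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        y[i] = std::exp(x[i] - x_max);         // exp of the differences, never overflows
        sum += y[i];
    }
    float inv_sum = 1.0f / sum;                // one reciprocal (the invert table above)
    for (int i = 0; i < n; i++)
        y[i] *= inv_sum;                       // n multiplications
}

int main() {
    const float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out[4];
    softmax_stable_ref(in, out, 4);
    for (int i = 0; i < 4; i++)
        std::printf("%f\n", out[i]);           // the four outputs sum to 1
    return 0;
}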
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; res_T out_pack; - - SoftmaxInvPackLoop: + + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++){ - + for (unsigned j = 0; j < res_T::size; j++) { + // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - + out_pack[j] = exp_res[j] * inv_exp_sum; } - + res.write(out_pack); } } -template -void softmax_latency(stream &data, stream &res){ - #include "activation_tables/exp_table_latency.tb" - #include "activation_tables/invert_table_latency.tb" - +template void softmax_latency(stream &data, stream &res) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size]; - - SoftmaxExpLoop: + +SoftmaxExpLoop: #pragma ii pipeline - for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { data_T in_pack = data.read(); - - SoftmaxExpPackLoop: + + SoftmaxExpPackLoop: #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { - exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val(in_pack[j])]; + for (unsigned j = 0; j < data_T::size; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val(in_pack[j])]; } // Explicitly sum the results with an adder tree. 
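Several of the streaming activations in this file (leaky_relu, elu, prelu, the softmax variants) derive their loop initiation interval from the reuse factor through the same two constexpr lines. A worked example of that arithmetic, with a local div_roundup standing in for the DIV_ROUNDUP macro (assumed here to be ceiling division; the sizes are examples only):

// Worked example of multiplier_limit / pipeline as computed in the streaming
// activations above.  div_roundup mirrors what DIV_ROUNDUP is assumed to do
// (ceiling division); pack_size and reuse_factor are example values.
#include <cstdio>

constexpr unsigned div_roundup(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main() {
    const unsigned pack_size = 16;   // data_T::size, elements per stream beat
    const unsigned reuse_factor = 4; // CONFIG_T::reuse_factor

    const unsigned multiplier_limit = div_roundup(pack_size, reuse_factor); // 4 multipliers
    const unsigned pipeline = pack_size / multiplier_limit;                 // loop II of 4

    std::printf("multiplier_limit=%u pipeline=%u\n", multiplier_limit, pipeline);
    // With reuse_factor = 1 every element gets its own multiplier and the II is 1.
    return 0;
}

In other words, a larger reuse factor trades throughput (a longer initiation interval) for fewer multiplier instances per stream beat.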
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; res_T out_pack; - SoftmaxInvPackLoop: + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++){ + for (unsigned j = 0; j < res_T::size; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -362,41 +373,42 @@ void softmax_latency(stream &data, stream &res){ } } -template -void softmax_legacy(stream &data, stream &res) { - #include "activation_tables/exp_table_legacy.tb" - #include "activation_tables/invert_table_legacy.tb" - +template void softmax_legacy(stream &data, stream &res) { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + // Index into the lookup table based on data for exponentials hls_register typename CONFIG_T::table_t exp_res[data_T::size]; hls_register typename CONFIG_T::table_t exp_diff_res; hls_register typename data_T::value_type data_cache[data_T::size]; - SoftmaxInitLoop: +SoftmaxInitLoop: #pragma ii 1 - for(unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { data_T in_pack = data.read(); - - SoftmaxInitPackLoop: + + SoftmaxInitPackLoop: #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + for (unsigned j = 0; j < data_T::size; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } - SoftmaxExpLoop: + SoftmaxExpLoop: #pragma unroll for (int i = 0; i < data_T::size; i++) { - SoftmaxExpInner: + SoftmaxExpInner: #pragma unroll for (int j = 0; j < data_T::size; j++) { if (i == j) { exp_diff_res = 1; } else { - int data_round = ((data_cache[j] - data_cache[i])*CONFIG_T::table_size/16).to_int(); + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table_legacy[index]; } exp_res[i] += exp_diff_res; @@ -404,21 +416,22 @@ void softmax_legacy(stream &data, stream &res) { } res_T out_pack; - SoftmaxInvPackLoop: + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++) { - int exp_res_index = (exp_res[j]*CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = (typename res_T::value_type) invert_table_legacy[exp_res_index]; + for (unsigned j = 0; j < res_T::size; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = (typename res_T::value_type)invert_table_legacy[exp_res_index]; } res.write(out_pack); } } -template -void 
softmax_argmax(stream &data, stream &res) { +template void softmax_argmax(stream &data, stream &res) { #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); @@ -426,11 +439,11 @@ void softmax_argmax(stream &data, stream &res) { #pragma unroll for (int i = 0; i < res_T::size; i++) { - out_data[i] = (typename res_T::value_type) 0; + out_data[i] = (typename res_T::value_type)0; } hls_register typename data_T::value_type maximum = in_data[0]; - hls_register int idx = 0; + hls_register int idx = 0; #pragma ii 1 for (int i = 1; i < res_T::size; i++) { @@ -440,64 +453,68 @@ void softmax_argmax(stream &data, stream &res) { } } - out_data[idx] = (typename res_T::value_type) 1; + out_data[idx] = (typename res_T::value_type)1; res.write(out_data); } } -template -void softmax(stream &data, stream &res) { - switch(CONFIG_T::implementation) { - case softmax_implementation::latency: - softmax_latency(data, res); - break; - case softmax_implementation::stable: - softmax_stable(data, res); - break; - case softmax_implementation::legacy: - softmax_legacy(data, res); - break; - case softmax_implementation::argmax: - softmax_argmax(data, res); - break; - default: - softmax_stable(data, res); - break; - } +template void softmax(stream &data, stream &res) { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + default: + softmax_stable(data, res); + break; + } } // ************************************************* // TanH Activation // ************************************************* -template -void dense_tanh(stream &data, stream &res) { - #include "activation_tables/tanh_table.tb" - static const int MAX_VALUE=4; +template void dense_tanh(stream &data, stream &res) { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - TanHActLoop: +TanHActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - TanHPackLoop: + TanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type absoluteValue; - if(in_data[j] < 0) absoluteValue = (-1)*in_data[j]; - else absoluteValue = in_data[j]; + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; hls_register int index; - if (absoluteValue <= MAX_VALUE) index = (absoluteValue*(CONFIG_T::table_size/MAX_VALUE)).to_int(); - else index = CONFIG_T::table_size-1; - - if(in_data[j] > 0) out_data[j] = tanh_table[index]; - else out_data[j] = -tanh_table[index]; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; } res.write(out_data); @@ -507,34 +524,39 @@ void dense_tanh(stream &data, stream &res) { // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(stream &data, stream &res) { - #include 
"activation_tables/sigmoid_table.tb" - static const int MAX_VALUE=8; +template void sigmoid(stream &data, stream &res) { +#include "activation_tables/sigmoid_table.tb" + static const int MAX_VALUE = 8; constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - SigmoidActLoop: +SigmoidActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SigmoidPackLoop: + SigmoidPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type absoluteValue; - if(in_data[j] < 0) absoluteValue = (-1)*in_data[j]; - else absoluteValue = in_data[j]; + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; hls_register int index; - if (absoluteValue <= MAX_VALUE) index = (absoluteValue*(CONFIG_T::table_size/MAX_VALUE)).to_int(); - else index = CONFIG_T::table_size-1; - - if(in_data[j] > 0) out_data[j] = sigmoid_table[index]; - else out_data[j] = 1 - sigmoid_table[index]; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; } res.write(out_data); @@ -545,27 +567,26 @@ void sigmoid(stream &data, stream &res) { // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template -void hard_sigmoid(stream &data, stream &res) { - static const typename data_T::value_type slope = (typename data_T::value_type) 0.2; - static const typename data_T::value_type shift = (typename data_T::value_type) 0.5; +template void hard_sigmoid(stream &data, stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - HardSigmoidActLoop: +HardSigmoidActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - HardSigmoidPackLoop: + HardSigmoidPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register typename data_T::value_type datareg = slope * in_data[j] + shift; - if (datareg > 1) datareg = 1; - else if (datareg < 0) datareg = 0; + hls_register auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; out_data[j] = datareg; } @@ -573,23 +594,51 @@ void hard_sigmoid(stream &data, stream &res) { } } +template void hard_tanh(stream &data, stream &res) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_T::size / multiplier_limit; + +HardSigmoidActLoop: + #pragma ii pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + // ************************************************* // Binary TanH Activation // 
************************************************* -template -void binary_tanh(stream &data, stream &res) { - BinaryTanHActLoop: +template void binary_tanh(stream &data, stream &res) { +BinaryTanHActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { - + hls_register data_T in_data = data.read(); hls_register res_T out_data; - BinaryTanHPackLoop: + BinaryTanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if(in_data[j] > 0) out_data[j] = (typename res_T::value_type) 1; - else out_data[j] = (typename res_T::value_type) -1; + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; } res.write(out_data); @@ -599,28 +648,29 @@ void binary_tanh(stream &data, stream &res) { // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(stream &data, stream &res) { - TernaryTanHActLoop: +template void ternary_tanh(stream &data, stream &res) { +TernaryTanHActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { - + hls_register data_T in_data = data.read(); hls_register res_T out_data; - TernaryTanHPackLoop: + TernaryTanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if(in_data[j] > 1) out_data[j] = (typename res_T::value_type) 1; - else if (in_data[j] <=-1) out_data[j] = (typename res_T::value_type) -1; - else out_data[j] = (typename res_T::value_type) 0; + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; } res.write(out_data); } - } -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/quartus/firmware/parameters.h b/hls4ml/templates/quartus/firmware/parameters.h index 75a3a0d700..e23ca9770f 100644 --- a/hls4ml/templates/quartus/firmware/parameters.h +++ b/hls4ml/templates/quartus/firmware/parameters.h @@ -4,9 +4,8 @@ #include "defines.h" #include "nnet_utils/nnet_helpers.h" -//hls-fpga-machine-learning insert includes - -//hls-fpga-machine-learning insert layer-config +// hls-fpga-machine-learning insert includes +// hls-fpga-machine-learning insert layer-config #endif diff --git a/hls4ml/templates/quartus/myproject_bridge.cpp b/hls4ml/templates/quartus/myproject_bridge.cpp index b0cdfc7564..f4c23b20f7 100644 --- a/hls4ml/templates/quartus/myproject_bridge.cpp +++ b/hls4ml/templates/quartus/myproject_bridge.cpp @@ -7,10 +7,10 @@ #include namespace nnet { - bool trace_enabled = false; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet extern "C" { @@ -23,7 +23,7 @@ void allocate_trace_storage(size_t element_size) { nnet::trace_enabled = true; nnet::trace_outputs = new std::map; nnet::trace_type_size = element_size; - //hls-fpga-machine-learning insert trace_outputs + // hls-fpga-machine-learning insert trace_outputs } void free_trace_storage() { @@ -48,18 +48,17 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { // Wrapper of top level function for Python bridge void myproject_float( - //hls-fpga-machine-learning insert header #float + // hls-fpga-machine-learning insert header #float ) { - - //hls-fpga-machine-learning insert wrapper #float + + // hls-fpga-machine-learning insert wrapper 
#float } void myproject_double( - //hls-fpga-machine-learning insert header #double + // hls-fpga-machine-learning insert header #double ) { - //hls-fpga-machine-learning insert wrapper #double + // hls-fpga-machine-learning insert wrapper #double } - } #endif diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp b/hls4ml/templates/quartus/myproject_test_parallel.cpp index 27e1476fd2..3809418536 100644 --- a/hls4ml/templates/quartus/myproject_test_parallel.cpp +++ b/hls4ml/templates/quartus/myproject_test_parallel.cpp @@ -16,14 +16,13 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . // +#include +#include #include #include -#include -#include #include -#include +#include -#include "firmware/parameters.h" #include "firmware/myproject.h" #define CHECKPOINT 5000 @@ -34,13 +33,12 @@ // This function returns the next float (by argument) at position pos, // updating pos. True is returned if conversion done, false if the string // has ended, and std::invalid_argument exception if the sting was bad. -bool nextToken(const std::string& str, std::size_t& pos, float& val) -{ +bool nextToken(const std::string &str, std::size_t &pos, float &val) { while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; + pos++; } if (pos >= str.size()) { - return false; + return false; } std::size_t offset = 0; val = std::stof(str.substr(pos), &offset); @@ -49,12 +47,11 @@ bool nextToken(const std::string& str, std::size_t& pos, float& val) } int main(int argc, char **argv) { - //load input data from text file + // load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); - //load predictions from text file + // load predictions from text file std::ifstream fpr("tb_data/tb_output_predictions.dat"); - std::string RESULTS_LOG = "tb_data/results.log"; std::ofstream fout(RESULTS_LOG); @@ -65,62 +62,62 @@ int main(int argc, char **argv) { std::vector outputs; if (fin.is_open() && fpr.is_open()) { - std::vector > predictions; - unsigned int num_iterations = 0; - for (; std::getline(fin,iline) && std::getline (fpr,pline); num_iterations++) { - if (num_iterations % CHECKPOINT == 0) { - std::cout << "Processing input " << num_iterations << std::endl; + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } + + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } + + // hls-fpga-machine-learning insert data + predictions.push_back(std::move(pr)); } - std::vector in; - std::vector pr; - float current; + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function - std::size_t pos = 0; - while(nextToken(iline, pos, current)) { - in.push_back(current); - } + // hls-fpga-machine-learning insert run - pos = 0; - while(nextToken(pline, pos, current)) { - pr.push_back(current); + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } } - 
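For context on the myproject_bridge.cpp hunks reformatted a little further up: they expose extern "C" float and double wrappers so the compiled library can be driven from Python. A minimal sketch of that wrapper pattern; fixed_t, run_model, model_float and the array sizes are illustrative stand-ins, not names from the generated project:

// Sketch of the Python-bridge pattern: an extern "C" wrapper converts plain
// float I/O into the model's internal type, runs the top-level function and
// converts back.  All names and sizes here are placeholders.
#include <cstddef>

typedef float fixed_t;                  // stand-in for the real ap_fixed/ac_fixed type
static const std::size_t N_IN = 16;
static const std::size_t N_OUT = 5;

static void run_model(const fixed_t in[N_IN], fixed_t out[N_OUT]) {
    for (std::size_t i = 0; i < N_OUT; i++)
        out[i] = in[i];                 // placeholder body for the synthesized network
}

extern "C" void model_float(float in[N_IN], float out[N_OUT]) {
    fixed_t in_fx[N_IN], out_fx[N_OUT];
    for (std::size_t i = 0; i < N_IN; i++)
        in_fx[i] = static_cast<fixed_t>(in[i]);   // cast inputs to the internal type
    run_model(in_fx, out_fx);
    for (std::size_t i = 0; i < N_OUT; i++)
        out[i] = static_cast<float>(out_fx[i]);   // cast outputs back to float
}

Such a symbol can then be loaded with ctypes; the generated bridge additionally provides a double variant and the trace-storage helpers shown above.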
- //hls-fpga-machine-learning insert data - predictions.push_back(std::move(pr)); - } - - // Do this separately to avoid vector reallocation - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert run - - - for(int j = 0; j < num_iterations; j++) { - //hls-fpga-machine-learning insert tb-output - if (j % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - //hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - //hls-fpga-machine-learning insert quantized - } - } - fin.close(); - fpr.close(); + fin.close(); + fpr.close(); } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." << std::endl; - //hls-fpga-machine-learning insert zero + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero - //hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert run + // hls-fpga-machine-learning insert run - for (int j = 0; j < num_iterations; j++) { - //hls-fpga-machine-learning insert output + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output - //hls-fpga-machine-learning insert tb-output - } + // hls-fpga-machine-learning insert tb-output + } } fout.close(); diff --git a/hls4ml/templates/quartus/myproject_test_stream.cpp b/hls4ml/templates/quartus/myproject_test_stream.cpp index 881cbea4f9..5e5f89e75e 100644 --- a/hls4ml/templates/quartus/myproject_test_stream.cpp +++ b/hls4ml/templates/quartus/myproject_test_stream.cpp @@ -1,11 +1,10 @@ +#include +#include #include #include -#include -#include #include -#include +#include -#include "firmware/parameters.h" #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" @@ -18,12 +17,12 @@ // This function returns the next float (by argument) at position pos, // updating pos. True is returned if conversion done, false if the string // has ended, and std::invalid_argument exception if the sting was bad. 
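The tokenizer documented above (its reformatted implementation follows below) is what both Quartus test benches use to split a whitespace-separated line of floats. A standalone sketch of the same parsing loop, with the helper re-implemented locally so the example compiles on its own:

// Standalone sketch of how the test benches consume nextToken; the helper is
// re-implemented here with the behaviour documented above so the example is
// self-contained.
#include <cctype>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

static bool next_token(const std::string &str, std::size_t &pos, float &val) {
    while (pos < str.size() && std::isspace(static_cast<unsigned char>(str[pos])))
        pos++;                                  // skip leading whitespace
    if (pos >= str.size())
        return false;                           // end of the line reached
    std::size_t offset = 0;
    val = std::stof(str.substr(pos), &offset);  // throws std::invalid_argument on bad input
    pos += offset;                              // advance past the parsed number
    return true;
}

int main() {
    std::string line = " 0.25 -1.5 3";
    std::vector<float> in;
    std::size_t pos = 0;
    float current;
    while (next_token(line, pos, current))      // same loop shape as in the test benches
        in.push_back(current);
    for (float v : in)
        std::cout << v << std::endl;            // prints 0.25, -1.5, 3
    return 0;
}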
-bool nextToken(const std::string& str, std::size_t& pos, float& val) { +bool nextToken(const std::string &str, std::size_t &pos, float &val) { while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; + pos++; } if (pos >= str.size()) { - return false; + return false; } std::size_t offset = 0; val = std::stof(str.substr(pos), &offset); @@ -35,8 +34,8 @@ int main(int argc, char **argv) { // Load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); std::string iline; - - //Load predictions from text file + + // Load predictions from text file std::ifstream fpr("tb_data/tb_output_predictions.dat"); std::string pline; @@ -45,80 +44,79 @@ int main(int argc, char **argv) { std::ofstream fout(RESULTS_LOG); if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - - unsigned int iteration = 0; - while(std::getline(fin,iline) && std::getline(fpr,pline)) { - if (iteration % CHECKPOINT == 0) { - std::cout << "Processing input " << iteration << std::endl; - } + std::vector> predictions; - //hls-fpga-machine learning instantiate inputs and outputs + unsigned int iteration = 0; + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (iteration % CHECKPOINT == 0) { + std::cout << "Processing input " << iteration << std::endl; + } - std::vector in; - std::vector pr; - float current; + // hls-fpga-machine learning instantiate inputs and outputs - std::size_t pos = 0; - while(nextToken(iline, pos, current)) { - in.push_back(current); - } + std::vector in; + std::vector pr; + float current; - pos = 0; - while(nextToken(pline, pos, current)) { - pr.push_back(current); - } + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } - //hls-fpga-machine-learning insert data + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } - predictions.push_back(std::move(pr)); + // hls-fpga-machine-learning insert data - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert run + predictions.push_back(std::move(pr)); - //hls-fpga-machine-learning convert output + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert tb-output - - if (iteration % CHECKPOINT == 0) { - std::cout << "Python Predictions" << std::endl; - //hls-fpga-machine-learning print predictions - - std::cout << "HLS predictions" << std::endl; - //hls-fpga-machine-learning print output - } + // hls-fpga-machine-learning insert run + + // hls-fpga-machine-learning convert output + + // hls-fpga-machine-learning insert tb-output - iteration++; - } + if (iteration % CHECKPOINT == 0) { + std::cout << "Python Predictions" << std::endl; + // hls-fpga-machine-learning print predictions - fin.close(); - fpr.close(); + std::cout << "HLS predictions" << std::endl; + // hls-fpga-machine-learning print output + } + + iteration++; + } + + fin.close(); + fpr.close(); } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." << std::endl; + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." 
<< std::endl; - for (int iteration = 0 ; iteration < num_iterations ; iteration++) { - //hls-fpga-machine learning instantiate inputs and outputs + for (int iteration = 0; iteration < num_iterations; iteration++) { + // hls-fpga-machine learning instantiate inputs and outputs - //hls-fpga-machine-learning insert zero + // hls-fpga-machine-learning insert zero - //hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert run + // hls-fpga-machine-learning insert run - //hls-fpga-machine-learning convert output + // hls-fpga-machine-learning convert output - //hls-fpga-machine-learning insert tb-output + // hls-fpga-machine-learning insert tb-output - if (iteration % CHECKPOINT == 0) { - std::cout << "HLS predictions" << std::endl; - //hls-fpga-machine-learning print output + if (iteration % CHECKPOINT == 0) { + std::cout << "HLS predictions" << std::endl; + // hls-fpga-machine-learning print output + } } - - } - } fout.close(); diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index 40ec72ea8f..1f11b02095 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -1,14 +1,14 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" #include "nnet_utils/nnet_types.h" #include #include -//hls-fpga-machine-learning insert numbers +// hls-fpga-machine-learning insert numbers -//hls-fpga-machine-learning insert layer-precision +// hls-fpga-machine-learning insert layer-precision #endif diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp b/hls4ml/templates/vivado/firmware/myproject.cpp index 3aa4d58a37..1c7342a344 100644 --- a/hls4ml/templates/vivado/firmware/myproject.cpp +++ b/hls4ml/templates/vivado/firmware/myproject.cpp @@ -22,15 +22,15 @@ #include "parameters.h" void myproject( - //hls-fpga-machine-learning insert header + // hls-fpga-machine-learning insert header ) { - //hls-fpga-machine-learning insert IO + // hls-fpga-machine-learning insert IO #ifndef __SYNTHESIS__ static bool loaded_weights = false; if (!loaded_weights) { - //hls-fpga-machine-learning insert load weights + // hls-fpga-machine-learning insert load weights loaded_weights = true; } #endif @@ -39,5 +39,5 @@ void myproject( // NETWORK INSTANTIATION // **************************************** - //hls-fpga-machine-learning insert layers + // hls-fpga-machine-learning insert layers } diff --git a/hls4ml/templates/vivado/firmware/myproject.h b/hls4ml/templates/vivado/firmware/myproject.h index 5e2130926f..1199fbc68e 100644 --- a/hls4ml/templates/vivado/firmware/myproject.h +++ b/hls4ml/templates/vivado/firmware/myproject.h @@ -20,15 +20,15 @@ #ifndef MYPROJECT_H_ #define MYPROJECT_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" #include "hls_stream.h" #include "defines.h" // Prototype of top level function for C-synthesis void myproject( - //hls-fpga-machine-learning insert header + // hls-fpga-machine-learning insert header ); #endif diff --git a/hls4ml/templates/vivado/firmware/parameters.h b/hls4ml/templates/vivado/firmware/parameters.h index addee4ef27..2d9ddedb3e 100644 --- a/hls4ml/templates/vivado/firmware/parameters.h +++ b/hls4ml/templates/vivado/firmware/parameters.h @@ -1,15 +1,15 @@ #ifndef PARAMETERS_H_ #define PARAMETERS_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" -#include "nnet_utils/nnet_helpers.h" #include 
"nnet_utils/nnet_code_gen.h" -//hls-fpga-machine-learning insert includes - -//hls-fpga-machine-learning insert weights +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert weights -//hls-fpga-machine-learning insert layer-config +// hls-fpga-machine-learning insert layer-config #endif diff --git a/hls4ml/templates/vivado/myproject_bridge.cpp b/hls4ml/templates/vivado/myproject_bridge.cpp index 210635ac09..35c1997f62 100644 --- a/hls4ml/templates/vivado/myproject_bridge.cpp +++ b/hls4ml/templates/vivado/myproject_bridge.cpp @@ -6,14 +6,13 @@ #include #include -//hls-fpga-machine-learning insert bram - +// hls-fpga-machine-learning insert bram namespace nnet { - bool trace_enabled = false; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet extern "C" { @@ -26,7 +25,7 @@ void allocate_trace_storage(size_t element_size) { nnet::trace_enabled = true; nnet::trace_outputs = new std::map; nnet::trace_type_size = element_size; - //hls-fpga-machine-learning insert trace_outputs + // hls-fpga-machine-learning insert trace_outputs } void free_trace_storage() { @@ -51,18 +50,17 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { // Wrapper of top level function for Python bridge void myproject_float( - //hls-fpga-machine-learning insert header #float + // hls-fpga-machine-learning insert header #float ) { - - //hls-fpga-machine-learning insert wrapper #float + + // hls-fpga-machine-learning insert wrapper #float } void myproject_double( - //hls-fpga-machine-learning insert header #double + // hls-fpga-machine-learning insert header #double ) { - //hls-fpga-machine-learning insert wrapper #double + // hls-fpga-machine-learning insert wrapper #double } - } #endif diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 7de8dd4b4a..9d7e7685c3 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -16,97 +16,95 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . 
// +#include #include #include -#include -#include #include +#include #include #include -#include +#include #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" -//hls-fpga-machine-learning insert bram +// hls-fpga-machine-learning insert bram #define CHECKPOINT 5000 namespace nnet { - bool trace_enabled = true; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet -int main(int argc, char **argv) -{ - //load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - //load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); +int main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); #ifdef RTL_SIM - std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; #else - std::string RESULTS_LOG = "tb_data/csim_results.log"; + std::string RESULTS_LOG = "tb_data/csim_results.log"; #endif - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - int e = 0; - - if (fin.is_open() && fpr.is_open()) { - while ( std::getline(fin,iline) && std::getline (fpr,pline) ) { - if (e % CHECKPOINT == 0) std::cout << "Processing input " << e << std::endl; - char* cstr=const_cast(iline.c_str()); - char* current; - std::vector in; - current=strtok(cstr," "); - while(current!=NULL) { - in.push_back(atof(current)); - current=strtok(NULL," "); - } - cstr=const_cast(pline.c_str()); - std::vector pr; - current=strtok(cstr," "); - while(current!=NULL) { - pr.push_back(atof(current)); - current=strtok(NULL," "); - } - - //hls-fpga-machine-learning insert data - - //hls-fpga-machine-learning insert top-level-function - - if (e % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - //hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - //hls-fpga-machine-learning insert quantized - } - e++; - - //hls-fpga-machine-learning insert tb-output - + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." 
<< std::endl; + + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output } - fin.close(); - fpr.close(); - } else { - std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; - - //hls-fpga-machine-learning insert zero - - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert output - - //hls-fpga-machine-learning insert tb-output - - } - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - return 0; + return 0; } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index af609d99da..3a96482db2 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -20,14 +20,13 @@ #ifndef NNET_ACTIVATION_H_ #define NNET_ACTIVATION_H_ -#include #include "ap_fixed.h" #include "nnet_common.h" +#include namespace nnet { -struct activ_config -{ +struct activ_config { // IO size static const unsigned n_in = 10; @@ -39,91 +38,80 @@ struct activ_config static const unsigned reuse_factor = 1; // Internal data type definitions - typedef ap_fixed<18,8> table_t; + typedef ap_fixed<18, 8> table_t; }; // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - for (int ii=0; ii -void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = 0; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; } } -template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii MAX_INT) res[ii] = MAX_INT; - else res[ii] = datareg; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; } } -template -void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } -template -void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } // ************************************************* // Sigmoid Activation // ************************************************* -inline float sigmoid_fcn_float(float input) { - return 1.0 / (1 + std::exp(-input)); -} +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } -template -void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default logistic sigmoid function: // result = 1/(1+e^(-x)) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, 
range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -142,12 +130,14 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) sigmoid_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; } } @@ -155,33 +145,29 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Softmax Activation // ************************************************* -enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3}; +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; -inline float exp_fcn_float(float input) { - return std::exp(input); -} +inline float exp_fcn_float(float input) { return std::exp(input); } -template -inline float softmax_real_val_from_idx(unsigned i){ +template inline float softmax_real_val_from_idx(unsigned i) { // Treat the index as the top N bits static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table data_T x(0); - x(x.width-1, x.width-N) = i; - return (float) x; + x(x.width - 1, x.width - N) = i; + return (float)x; } -template -inline unsigned softmax_idx_from_real_val(data_T x){ +template inline unsigned softmax_idx_from_real_val(data_T x) { // Slice the top N bits to get an index into the table static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table - ap_uint y = x(x.width-1, x.width-N); // slice the top N bits of input - return (unsigned) y(N-1, 0); + ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); } -template -void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]){ +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { // The template data_T is the data type used to address the table - for(unsigned i = 0; i < CONFIG_T::table_size; i++){ + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { // Slicing bits for address is going to round towards 0, so take the central value float x = softmax_real_val_from_idx(i); typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); @@ -189,10 +175,10 @@ void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_siz } } -template -void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]){ +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { // The template data_T is the data type used to address 
the table - for(unsigned i = 0; i < CONFIG_T::table_size; i++){ + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { float x = softmax_real_val_from_idx(i); typename CONFIG_T::inv_table_t inv_x = 1 / x; table_out[i] = inv_x; @@ -200,7 +186,7 @@ void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_ } template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS pipeline // Initialize the lookup tables #ifdef __HLS_SYN__ @@ -225,7 +211,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll unsigned x = softmax_idx_from_real_val(data[i]); exp_res[i] = exp_table[x]; @@ -234,17 +220,19 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; } } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS pipeline // Initialize the lookup tables #ifdef __HLS_SYN__ @@ -270,8 +258,8 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ data_T x_max = reduce>(data, op_max); // For the diffs, use the same type as the input but force rounding and saturation - ap_fixed d_xi_xmax[CONFIG_T::n_in]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll d_xi_xmax[i] = data[i] - x_max; } @@ -280,7 +268,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); exp_res[i] = exp_table[x]; @@ -289,45 +277,44 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // Explicitly sum the results with an adder tree. 
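The softmax_idx_from_real_val helper above addresses the exp and invert tables by slicing the top N address bits (N = ceillog2(table_size)) out of the fixed-point word. A plain-integer sketch of the same idea, emulating the word with uint16_t (the width, N and table size are example values, not the configured ones):

// Sketch of the bit-slice table addressing used by softmax_idx_from_real_val:
// the top N bits of a W-bit word select one of 2^N table entries, so nearby
// values share a bin.  W, N and TABLE_SIZE are example values.
#include <cstdint>
#include <cstdio>

static const unsigned W = 10;          // total bits of the emulated fixed-point word
static const unsigned TABLE_SIZE = 64; // a power of two
static const unsigned N = 6;           // log2(TABLE_SIZE) address bits

unsigned idx_from_word(std::uint16_t x) {
    return (x >> (W - N)) & (TABLE_SIZE - 1); // keep only the top N of the W bits
}

int main() {
    std::printf("%u\n", idx_from_word(0x000)); // 0
    std::printf("%u\n", idx_from_word(0x00F)); // still 0: the low bits are dropped
    std::printf("%u\n", idx_from_word(0x3FF)); // 63, the last bin
    return 0;
}

init_exp_table and init_invert_table above fill each bin with the function value at the bin's representative point, so the lookup error is bounded by the bin width.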
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; } } -template -void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { // Inversion function: // result = 1/x for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range 0 to +64) - float in_val = 64.0*ii/float(N_TABLE); + float in_val = 64.0 * ii / float(N_TABLE); // Next, compute lookup table function - if (in_val > 0.0) table_out[ii] = 1.0/in_val; - else table_out[ii] = 0.0; + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; } } -template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -347,50 +334,54 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) #pragma HLS PIPELINE // Index into the lookup table based on data for exponentials - typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision - typename CONFIG_T::table_t exp_diff_res;// different, independent, fixed point precision + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision data_T data_cache[CONFIG_T::n_in]; int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table[index]; } exp_res[ii] += exp_diff_res; } } - //Second loop to invert - for (int ii=0; ii CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; - //typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; - res[ii] = (res_T) invert_table[exp_res_index]; + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if 
(exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; } - } -template +template void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { for (int i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS UNROLL - res[i] = (res_T) 0; + res[i] = (res_T)0; } data_T maximum = data[0]; - int idx = 0; + int idx = 0; for (int i = 1; i < CONFIG_T::n_in; i++) { #pragma HLS PIPELINE @@ -400,13 +391,13 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } - res[idx] = (res_T) 1; + res[idx] = (res_T)1; } -template -void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS inline - switch(CONFIG_T::implementation){ + switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency(data, res); break; @@ -425,24 +416,20 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // ************************************************* // TanH Activation // ************************************************* -template -void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Implement tanh lookup for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -4 to +4) - float in_val = 2*4.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = tanh(in_val); - //std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; table_out[ii] = real_val; } } - -template -void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -461,92 +448,105 @@ void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) tanh_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; } } // ************************************************* // Hard sigmoid Activation // ************************************************* -template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - data_T datareg; - data_T slope = (data_T) 0.2; - data_T shift = (data_T) 0.5; - for (int ii=0; ii 1) datareg = 1; - else if (datareg < 0) datareg = 0; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto 
datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; res[ii] = datareg; } } +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; } } // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) -{ +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii theta) res[ii] = datareg; - else res[ii] = 0; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; } } // ************************************************* // Softplus Activation // ************************************************* -inline float softplus_fcn_float(float input) { - return std::log(std::exp(input) + 1.); -} +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } -template -void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default softplus function: // result = log(exp(x) + 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -565,40 +565,37 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softplus_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; } } // 
************************************************* // Softsign Activation // ************************************************* -inline float softsign_fcn_float(float input) { - return input / (std::abs(input) + 1.); -} +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } -template -void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default softsign function: // result = x / (abs(x) + 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -617,40 +614,37 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softsign_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; } } // ************************************************* // ELU Activation // ************************************************* -inline float elu_fcn_float(float input) { - return std::exp(input) - 1.; -} +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } -template -void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default ELU function: // result = alpha * (e^(x) - 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0*ii/float(N_TABLE); + float in_val = -8.0 * ii / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -669,21 +663,20 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_ data_T datareg; // Index into the lookup table based on data int index; - for (int ii=0; ii= 0) { res[ii] = datareg; } else { - index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = 
CONFIG_T::table_size-1; + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = alpha * elu_table[index]; } } } -template -void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { elu(data, 1.0, res); } @@ -694,24 +687,20 @@ inline float selu_fcn_float(float input) { return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); } -template -void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default SELU function: // result = 1.05 * (1.673 * (e^(x) - 1)) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0*ii/float(N_TABLE); + float in_val = -8.0 * ii / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -730,13 +719,14 @@ void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) data_T datareg; // Index into the lookup table based on data int index; - for (int ii=0; ii= 0) { res[ii] = res_T(1.0507009873554804934193349852946) * datareg; } else { - index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = selu_table[index]; } } @@ -745,59 +735,62 @@ void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha[ii] * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; } } // ************************************************* // Binary TanH Activation // ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; res_T cache; - for (int ii=0; ii 0 ) cache = 1; - else cache = -1; + if (datareg > 0) + cache = 1; + else + cache = -1; - res[ii] = (res_T) cache; + res[ii] = (res_T)cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - - data_T datareg; 
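// Worked example of the ternary mapping in this function, assuming a data_T wide
// enough that 2 * data[ii] does not overflow: comparing datareg = 2 * data[ii]
// against +/-1 places the decision thresholds at +/-0.5 on the input itself.
//   data[ii] =  0.3  ->  datareg =  0.6  ->  res[ii] =  0
//   data[ii] =  0.7  ->  datareg =  1.4  ->  res[ii] = +1
//   data[ii] = -0.8  ->  datareg = -1.6  ->  res[ii] = -1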
- res_T cache; - for (int ii=0; ii 1 ) cache = 1; - else if( datareg > -1 && datareg <= 1) cache=0; - else cache = -1; - - res[ii] = (res_T) cache; + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; } - } -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 3accdc6505..075672c6bf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -20,29 +20,30 @@ #ifndef NNET_ACTIVATION_STREAM_H_ #define NNET_ACTIVATION_STREAM_H_ -#include #include "ap_fixed.h" #include "hls_stream.h" +#include "nnet_activation.h" #include "nnet_common.h" -#include "nnet_types.h" #include "nnet_stream.h" -#include "nnet_activation.h" +#include "nnet_types.h" +#include namespace nnet { // ************************************************* // LINEAR Activation // ************************************************* -template -void linear(hls::stream &data, hls::stream &res) { - LinearActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +template void linear(hls::stream &data, hls::stream &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - LinearPackLoop: for (int j = 0; j < res_T::size; j++) { + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL out_data[j] = in_data[j]; } @@ -51,23 +52,25 @@ void linear(hls::stream &data, hls::stream &res) { } } - // ************************************************* // RELU Activation // ************************************************* -template -void relu(hls::stream &data, hls::stream &res) { - ReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +template void relu(hls::stream &data, hls::stream &res) { +ReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - ReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + ReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -78,8 +81,7 @@ void relu(hls::stream &data, hls::stream &res) { // Sigmoid Activation // ************************************************* -template -void sigmoid(hls::stream &data, hls::stream &res) { +template void sigmoid(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -93,19 +95,23 @@ void sigmoid(hls::stream &data, hls::stream &res) { initialized = true; } - SigmoidActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - 
if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = sigmoid_table[index]; } @@ -113,13 +119,12 @@ void sigmoid(hls::stream &data, hls::stream &res) { } } - // ************************************************* // Softmax Activation // ************************************************* template -void softmax_latency(hls::stream &data, hls::stream &res){ +void softmax_latency(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; @@ -146,11 +151,13 @@ void softmax_latency(hls::stream &data, hls::stream &res){ typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - SoftmaxExpLoop: for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++){ +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_pack = data.read(); - SoftmaxExpPackLoop: for(unsigned j = 0; j < data_T::size; j++){ + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL unsigned x = softmax_idx_from_real_val(in_pack[j]); exp_res[j] = exp_table[x]; @@ -159,13 +166,16 @@ void softmax_latency(hls::stream &data, hls::stream &res){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; @@ -175,7 +185,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ } template -void softmax_stable(hls::stream &data, hls::stream &res){ +void softmax_stable(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; @@ -199,23 +209,26 @@ void softmax_stable(hls::stream &data, hls::stream &res){ constexpr unsigned ii = data_T::size / multiplier_limit; typename data_T::value_type data_array[data_T::size]; - #pragma HLS ARRAY_PARTITION variable=data_array complete - SoftmaxArrayLoop: for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++){ +#pragma HLS ARRAY_PARTITION variable=data_array complete +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_pack = data.read(); - SoftmaxArrayPackLoop: for(unsigned j = 0; j < data_T::size; j++){ + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; - typename data_T::value_type x_max = reduce>(data_array, op_max); + typename data_T::value_type x_max = + reduce>(data_array, op_max); // For the diffs, use the 
same type as the input but force rounding and saturation - ap_fixed d_xi_xmax[data_T::size]; - for(unsigned j = 0; j < data_T::size; j++){ + ap_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL d_xi_xmax[j] = data_array[j] - x_max; } @@ -224,7 +237,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma HLS ARRAY_PARTITION variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned j = 0; j < data_T::size; j++){ + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); exp_res[j] = exp_table[x]; @@ -233,13 +246,16 @@ void softmax_stable(hls::stream &data, hls::stream &res){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; @@ -248,7 +264,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ } } -template +template void softmax_legacy(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ @@ -271,18 +287,22 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { typename CONFIG_T::table_t exp_diff_res; typename data_T::value_type data_cache[data_T::size]; - SoftmaxInitLoop: for(unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { #pragma HLS PIPELINE data_T in_pack = data.read(); - SoftmaxInitPackLoop: for(unsigned j = 0; j < data_T::size; j++) { + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL data_cache[j] = in_pack[j]; exp_res[j] = 0; } - SoftmaxExpLoop: for (int i = 0; i < data_T::size; i++) { - #pragma HLS UNROLL - SoftmaxExpInner: for (int j = 0; j < data_T::size; j++) { + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + #pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL if (i == j) { @@ -290,8 +310,10 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } else { int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table[index]; } @@ -300,21 +322,24 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++) { + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; - if (exp_res_index < 0) 
exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = (typename res_T::value_type) invert_table[exp_res_index]; + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; } res.write(out_pack); } } -template +template void softmax_argmax(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE @@ -323,11 +348,11 @@ void softmax_argmax(hls::stream &data, hls::stream &res) { for (int i = 0; i < res_T::size; i++) { #pragma HLS UNROLL - out_data[i] = (typename res_T::value_type) 0; + out_data[i] = (typename res_T::value_type)0; } typename data_T::value_type maximum = in_data[0]; - int idx = 0; + int idx = 0; for (int i = 1; i < res_T::size; i++) { #pragma HLS PIPELINE @@ -337,17 +362,15 @@ void softmax_argmax(hls::stream &data, hls::stream &res) { } } - out_data[idx] = (typename res_T::value_type) 1; + out_data[idx] = (typename res_T::value_type)1; res.write(out_data); } } - -template -void softmax(hls::stream &data, hls::stream &res){ +template void softmax(hls::stream &data, hls::stream &res) { assert(CONFIG_T::axis == -1); - switch(CONFIG_T::implementation){ + switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency(data, res); break; @@ -360,16 +383,14 @@ void softmax(hls::stream &data, hls::stream &res){ case softmax_implementation::argmax: softmax_argmax(data, res); break; - } + } } // ************************************************* // TanH Activation // ************************************************* - -template -void tanh(hls::stream &data, hls::stream &res) { +template void tanh(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -383,19 +404,23 @@ void tanh(hls::stream &data, hls::stream &res) { initialized = true; } - TanHActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - TanHPackLoop: for (int j = 0; j < res_T::size; j++) { + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/8; - int index = data_round + 4*CONFIG_T::table_size/8; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = tanh_table[index]; } @@ -403,28 +428,29 @@ void tanh(hls::stream &data, hls::stream &res) { } } - // ************************************************* // Hard sigmoid Activation // ************************************************* -template +template void hard_sigmoid(hls::stream &data, hls::stream &res) { - typename data_T::value_type slope = (typename data_T::value_type) 0.2; - typename data_T::value_type shift = (typename data_T::value_type) 0.5; - HardSigmoidActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; 
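// Worked example of the hard sigmoid computed below, assuming the slope and shift
// values 0.2 and 0.5 that the removed local constants above hard-coded
// (CONFIG_T::slope and CONFIG_T::shift may supply different values):
//   x = -1.0  ->  0.2 * (-1.0) + 0.5 =  0.3   (within [0, 1], kept as is)
//   x =  3.0  ->  0.2 *   3.0  + 0.5 =  1.1   (clamped to 1)
//   x = -4.0  ->  0.2 * (-4.0) + 0.5 = -0.3   (clamped to 0)
// The hard_tanh variant then rescales the clamped value y to 2 * y - 1, e.g. 0.3 -> -0.4.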
PRAGMA_DATA_PACK(out_data) - HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - typename data_T::value_type datareg = slope * in_data[j] + shift; - if (datareg > 1) datareg = 1; - else if (datareg < 0) datareg = 0; + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; out_data[j] = datareg; } @@ -432,60 +458,89 @@ void hard_sigmoid(hls::stream &data, hls::stream &res) { } } +template void hard_tanh(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + #pragma HLS DATA_PACK variable=out_data + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} // ************************************************* // Leaky RELU Activation // ************************************************* -template +template void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { - LeakyReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; } res.write(out_data); } } - // ************************************************* // Thresholded RELU Activation // ************************************************* -template +template void thresholded_relu(hls::stream &data, typename data_T::value_type theta, hls::stream &res) { - ThresholdedReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > theta) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); } } - // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(hls::stream &data, hls::stream &res) { +template void softplus(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -499,32 +554,34 @@ void softplus(hls::stream &data, hls::stream &res) { initialized = true; } - SoftplusActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { + 
SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softplus_table[index]; } res.write(out_data); } } - // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(hls::stream &data, hls::stream &res) { +template void softsign(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -538,30 +595,33 @@ void softsign(hls::stream &data, hls::stream &res) { initialized = true; } - SoftsignActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softsign_table[index]; } res.write(out_data); } } - // ************************************************* // ELU Activation // ************************************************* -template +template void elu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ @@ -576,22 +636,25 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre initialized = true; } - EluActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - EluPackLoop: for (int j = 0; j < res_T::size; j++) { + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - + typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { - int index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = alpha * elu_table[index]; } } @@ -599,8 +662,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre } } -template -void elu(hls::stream &data, hls::stream &res) { +template void elu(hls::stream &data, hls::stream &res) { elu(data, 1.0, res); } @@ -608,8 +670,7 @@ void elu(hls::stream &data, hls::stream &res) { // SELU Activation // ************************************************* -template -void selu(hls::stream &data, hls::stream &res) { +template void selu(hls::stream &data, hls::stream 
&res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -623,22 +684,25 @@ void selu(hls::stream &data, hls::stream &res) { initialized = true; } - SeluActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SeluPackLoop: for (int j = 0; j < res_T::size; j++) { + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = (typename data_T::value_type) 1.0507009873554804934193349852946 * datareg; + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; } else { - int index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = selu_table[index]; } } @@ -646,24 +710,27 @@ void selu(hls::stream &data, hls::stream &res) { } } - // ************************************************* // PReLU Activation // ************************************************* -template +template void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T::n_in], hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha[i*res_T::size+j] * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; } res.write(out_data); } @@ -672,19 +739,23 @@ void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T // ************************************************* // Binary TanH Activation // ************************************************* -template +template void binary_tanh(hls::stream &data, hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if(in_data[j] > 0) out_data[j] = (typename res_T::value_type) 1; - else out_data[j] = (typename res_T::value_type) -1; + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; } res.write(out_data); } @@ -693,26 +764,30 @@ void binary_tanh(hls::stream &data, hls::stream &res) { // ************************************************* // Ternary TanH Activation // ************************************************* -template +template void ternary_tanh(hls::stream &data, hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < 
res_T::size; j++) { + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if(in_data[j] > 1) out_data[j] = (typename res_T::value_type) 1; - else if (in_data[j] <=-1) out_data[j] = (typename res_T::value_type) -1; - else out_data[j] = (typename res_T::value_type) 0; + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; } res.write(out_data); } } - -} +} // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index d170eb6678..e4db43682e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,37 +1,32 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ -#include #include "nnet_helpers.h" +#include namespace nnet { -template -class FillConv1DBuffer{ - public: - static void fill_buffer( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition - ) { +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { // To be implemented in subclasses } }; -template -class FillConv2DBuffer{ - public: - static void fill_buffer( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition - ) { +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { // To be implemented in subclasses } }; -//hls4ml insert code +// hls4ml insert code -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp index 7a06633e58..05797f1f7b 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp @@ -1,17 +1,14 @@ -//hls-fpga-machine-learning insert include +// hls-fpga-machine-learning insert include -void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ){ +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { - //hls-fpga-machine-learning insert interface + // hls-fpga-machine-learning insert interface - //hls-fpga-machine-learning insert local vars + // hls-fpga-machine-learning insert local vars - //hls-fpga-machine-learning insert enqueue + // hls-fpga-machine-learning insert enqueue - //hls-fpga-machine-learning insert call + // hls-fpga-machine-learning insert call - //hls-fpga-machine-learning insert dequeue + // hls-fpga-machine-learning insert dequeue } diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.h b/hls4ml/templates/vivado_accelerator/myproject_axi.h index fe3dbc5cde..a60dab39c4 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.h +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.h @@ -2,12 +2,9 @@ #define MYPROJECT_AXI_H_ #include -//hls-fpga-machine-learning insert include +// 
hls-fpga-machine-learning insert include -//hls-fpga-machine-learning insert definitions +// hls-fpga-machine-learning insert definitions -void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ); +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); #endif diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index b63e2b48e5..b1a47c6552 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -40,18 +40,32 @@ def _get_precision_from_quantizer(quantizer): 'quantized_bits', 'quantized_relu', 'quantized_tanh', + 'quantized_sigmoid', 'quantized_po2', 'quantized_relu_po2', 'linear', ] signed = True + rnd = "AP_TRN" + overflow = "AP_WRAP" + if quantizer['class_name'] in supported_quantizers: bits = int(quantizer['config']['bits']) # if integer isn't specified, it should be the same as bits integer = int(quantizer['config'].get('integer', bits - 1)) + 1 - if quantizer['class_name'] == 'quantized_relu': + # for quantizers use the following default rounding and overflow + rnd = "AP_RND_CONV" + overflow = "AP_SAT" + if quantizer['class_name'] in ('quantized_relu', 'quantized_relu_po2'): signed = False integer -= 1 + elif quantizer['class_name'] == 'quantized_tanh': + overflow = "AP_SAT_SYM" if quantizer['config']['symmetric'] else "AP_SAT" + integer = 1 + elif quantizer['class_name'] == 'quantized_sigmoid': + integer = 0 + signed = False + elif quantizer['class_name'] in ['binary', 'stochastic_binary', 'binary_tanh']: bits = 2 integer = 2 @@ -65,7 +79,9 @@ def _get_precision_from_quantizer(quantizer): decimal = bits - integer if decimal > 0: - return hls4ml.model.types.FixedPrecisionType(width=bits, integer=integer, signed=signed) + return hls4ml.model.types.FixedPrecisionType( + width=bits, integer=integer, signed=signed, rounding_mode=rnd, saturation_mode=overflow + ) else: return hls4ml.model.types.IntegerPrecisionType(width=integer, signed=signed) diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 28dd2d7eb7..a958a6b0b5 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -148,7 +148,7 @@ def write_project_cpp(self, model): # Intel HLS 'streams' need to be passed by reference to top-level entity or declared as global variables # Streams cannot be declared inside a function # Therefore, layer connections (inputs/outputs) are declared here - elif '//hls-fpga-machine-learning insert inter-task streams' in line: + elif '// hls-fpga-machine-learning insert inter-task streams' in line: newline = line if io_type == 'io_stream': for layer in model.get_layers(): @@ -159,7 +159,7 @@ def write_project_cpp(self, model): newline += def_cpp + ';\n' # Instantiate GCC top-level function, to be used during GCC compilation / hls4ml.predict() - elif '//hls-fpga-machine-learning instantiate GCC top-level' in line: + elif '// hls-fpga-machine-learning instantiate GCC top-level' in line: newline = line if io_type == 'io_stream': newline += f'void {project_name}(\n' @@ -174,7 +174,7 @@ def write_project_cpp(self, model): newline += ') {\n' # Instantiate HLS top-level function, to be used during HLS synthesis - elif '//hls-fpga-machine-learning instantiate HLS top-level' in line: + elif '// hls-fpga-machine-learning instantiate HLS top-level' in line: newline = line if io_type == 'io_stream': newline += f'component void {project_name}(\n' @@ -189,7 +189,7 @@ def write_project_cpp(self, model): newline += ') {\n' # Insert HLS pragmas such as maximum frequency, initiation interval etc. 
- elif '//hls-fpga-machine-learning insert cpragmas' in line: + elif '// hls-fpga-machine-learning insert cpragmas' in line: newline = line if io_type == 'io_parallel': newline += 'hls_max_concurrency(0)\n' @@ -202,7 +202,7 @@ def write_project_cpp(self, model): # In io_stream, the input is of type 'stream_in' and output is of type 'stream_out' # However, individual layers accept the type 'stream' # Therefore, data is first read from 'stream_in', written to 'stream' and propagated through network - elif '//hls-fpga-machine-learning initialize input/output' in line: + elif '// hls-fpga-machine-learning initialize input/output' in line: if io_type == 'io_stream': newline = line for inp in model_inputs: @@ -215,21 +215,21 @@ def write_project_cpp(self, model): newline += indent + 'hls_register output_data outputs;\n' # Insert weights - elif '//hls-fpga-machine-learning insert weights' in line: + elif '// hls-fpga-machine-learning insert weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): newline += f'#include "weights/{w.name}.h"\n' # Insert test weights - elif '//hls-fpga-machine-learning insert test weights' in line: + elif '// hls-fpga-machine-learning insert test weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): newline += f'#include "weights/{w.name}_test.h"\n' # Neural net instantiation - elif '//hls-fpga-machine-learning insert layers' in line: + elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -254,7 +254,7 @@ def write_project_cpp(self, model): newline += '\n' # In io_parallel, a return is required; for more details see myproject.cpp & myproject.h - elif '//hls-fpga-machine-learning return' in line: + elif '// hls-fpga-machine-learning return' in line: if io_type == 'io_stream': newline = line for out in model_outputs: @@ -304,7 +304,7 @@ def write_project_header(self, model): elif 'myproject' in line: newline = line.replace('myproject', project_name) - elif '//hls-fpga-machine-learning instantiate GCC top-level' in line: + elif '// hls-fpga-machine-learning instantiate GCC top-level' in line: newline = line # For io_stream, input and output are passed by reference; see myproject.h & myproject.cpp for more details @@ -322,7 +322,7 @@ def write_project_header(self, model): newline += ');\n' # Similar to GCC instantiation, but with the keyword 'component' - elif '//hls-fpga-machine-learning instantiate HLS top-level' in line: + elif '// hls-fpga-machine-learning instantiate HLS top-level' in line: newline = line if io_type == 'io_stream': newline += f'component void {project_name}(\n' @@ -336,7 +336,7 @@ def write_project_header(self, model): newline += indent + 'input_data inputs\n' newline += ');\n' - elif '//hls-fpga-machine-learning insert cpragmas' in line: + elif '// hls-fpga-machine-learning insert cpragmas' in line: newline = line if io_type == 'io_parallel': newline += 'hls_max_concurrency(0)\n' @@ -346,14 +346,14 @@ def write_project_header(self, model): # For io_stream, no inputs/outputs are instantiated, as they are passed by reference # For io_parallel, input/output structs are required - elif '//hls-fpga-machine-learning insert inputs' in line: + elif '// hls-fpga-machine-learning insert inputs' in line: newline = line if io_type != 'io_stream': newline += 'struct input_data { \n' for inp in model_inputs: newline += indent + inp.definition_cpp() + ';\n' newline 
+= '};\n' - elif '//hls-fpga-machine-learning insert outputs' in line: + elif '// hls-fpga-machine-learning insert outputs' in line: newline = line if io_type != 'io_stream': newline += 'struct output_data { \n' @@ -382,12 +382,12 @@ def write_defines(self, model): for line in f.readlines(): # Insert numbers - if '//hls-fpga-machine-learning insert numbers' in line: + if '// hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) newline += ''.join(numbers) - elif '//hls-fpga-machine-learning insert layer-precision' in line: + elif '// hls-fpga-machine-learning insert layer-precision' in line: newline = line all_precision = OrderedDict() for layer in model.get_layers(): @@ -418,12 +418,12 @@ def write_parameters(self, model): for line in f.readlines(): - if '//hls-fpga-machine-learning insert includes' in line: + if '// hls-fpga-machine-learning insert includes' in line: newline = line for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include - elif "//hls-fpga-machine-learning insert layer-config" in line: + elif "// hls-fpga-machine-learning insert layer-config" in line: newline = line for layer in model.get_layers(): config = layer.get_attr('config_cpp', None) @@ -487,7 +487,7 @@ def write_testbench_parallel(self, model): if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' newline += ' std::vector::const_iterator in_end;\n' @@ -497,7 +497,7 @@ def write_testbench_parallel(self, model): newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' newline += ' in_begin = in_end;\n' newline += ' outputs.emplace_back();\n' - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' for inp in model.get_input_variables(): @@ -506,7 +506,7 @@ def write_testbench_parallel(self, model): newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' newline += indent + '}\n' - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' @@ -515,20 +515,21 @@ def write_testbench_parallel(self, model): elif 'hls-fpga-machine-learning insert run' in line: newline = line newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '//hls-fpga-machine-learning insert predictions' in line: + elif '// hls-fpga-machine-learning insert predictions' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << predictions[j][i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' newline += indent + '}\n' newline += indent + 
'fout << std::endl;\n' elif ( - '//hls-fpga-machine-learning insert output' in line or '//hls-fpga-machine-learning insert quantized' in line + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line ): newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' @@ -589,7 +590,7 @@ def write_testbench_stream(self, model): if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine learning instantiate inputs and outputs' in line: + elif '// hls-fpga-machine learning instantiate inputs and outputs' in line: newline = line for inp in model_inputs: newline += indent + f'stream_in<{inp.type.name}> {inp.name}_input;\n' @@ -597,7 +598,7 @@ def write_testbench_stream(self, model): newline += indent + f'stream_out<{out.type.name}> {out.name}_output;\n' # TODO - This is one-input specific (are multiple model inputs needed at all?) - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line c = 0 for inp in model_inputs: @@ -611,7 +612,7 @@ def write_testbench_stream(self, model): ) c += 1 - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line c = 0 for inp in model_inputs: @@ -625,7 +626,7 @@ def write_testbench_stream(self, model): ) c += 1 - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line input_params = ', '.join([f'{i.name}_input' for i in model_inputs]) output_params = ', '.join([f'{o.name}_output' for o in model_outputs]) @@ -638,27 +639,27 @@ def write_testbench_stream(self, model): newline = line newline += indent + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '//hls-fpga-machine-learning convert output' in line: + elif '// hls-fpga-machine-learning convert output' in line: newline = line newline += indent + f'float res[{outvar.size_cpp()}];\n' newline += indent + 'nnet::convert_data_back<{}, float, {}>({}_output, res);\n'.format( outvar.type.name, outvar.size_cpp(), outvar.name ) - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' fout << res[i] << " ";\n' newline += indent + '}\n' newline += indent + 'fout << std::endl;\n' - elif '//hls-fpga-machine-learning print predictions' in line: + elif '// hls-fpga-machine-learning print predictions' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << predictions[iteration][i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning print output' in line: + elif '// hls-fpga-machine-learning print output' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << res[i] << " "; \n' @@ -711,7 +712,7 @@ def write_bridge(self, model): elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() if io_type == 'io_stream': inputs_str = ', 
'.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) @@ -729,7 +730,7 @@ def write_bridge(self, model): newline += indent + insize_str + ',\n' newline += indent + outsize_str + '\n' - elif '//hls-fpga-machine-learning insert wrapper' in line: + elif '// hls-fpga-machine-learning insert wrapper' in line: dtype = line.split('#', 1)[1].strip() if io_type == 'io_stream': newline = '' @@ -782,7 +783,7 @@ def write_bridge(self, model): newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format( o.type.name, dtype, o.size_cpp(), o.member_name, o.member_name ) - elif '//hls-fpga-machine-learning insert trace_outputs' in line: + elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp') diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index b92ce74ab7..46c193fdbd 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -1,71 +1,96 @@ import os -from shutil import copyfile, copytree from distutils.dir_util import copy_tree +from shutil import copyfile + from hls4ml.writer.vivado_writer import VivadoWriter -class VivadoAcceleratorWriter(VivadoWriter): +class VivadoAcceleratorWriter(VivadoWriter): def __init__(self): super().__init__() self.vivado_accelerator_config = None def write_axi_wrapper(self, model): - ''' Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces - Args: - model : The ModelGraph to write the wrapper for + '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces + Args: + model : The ModelGraph to write the wrapper for ''' inp_axi_t, out_axi_t, inp, out = self.vivado_accelerator_config.get_corrected_types() indent = ' ' ####################### - ## myproject_axi.h + # myproject_axi.h ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.h'), 'r') - fout = open('{}/firmware/{}_axi.h'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) - elif '//hls-fpga-machine-learning insert include' in line: - newline = '#include "{}.h"\n'.format(model.config.get_project_name()) - elif 'void myproject(' in line: - newline = 'void {}_axi(\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert definitions' in line: + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}.h"\n' + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert definitions' in line: newline = '' - newline += 'static const unsigned N_IN = {};\n'.format(inp.size()) - newline += 'static const unsigned N_OUT = {};\n'.format(out.size()) + newline += f'static const unsigned N_IN = {inp.size()};\n' + newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += 'typedef {} T_in;\n'.format(inp_axi_t) - newline += 'typedef {} T_out;\n'.format(out_axi_t) - 
newline += 'typedef struct in_struct {\n' + \ - indent + 'T_in data;\n' + \ - indent + 'ap_uint<1> last;\n' + \ - indent + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + \ - indent + 'in_struct(){this->data = 0; this->last = 0;};\n' + \ - indent + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + \ - indent + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + \ - indent + 'operator float() const {return this->data;}\n' + \ - indent + 'operator double() const {return this->data;}\n' + \ - indent + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + \ - indent + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + \ - '} input_axi_t;\n' - newline += 'typedef struct out_struct {\n' + \ - indent + 'T_out data;\n' + \ - indent + 'ap_uint<1> last;\n' + \ - indent + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + \ - indent + 'out_struct(){this->data = 0; this->last = 0;};\n' + \ - indent + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + \ - indent + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + \ - indent + 'operator float() const {return this->data;}\n' + \ - indent + 'operator double() const {return this->data;}\n' + \ - indent + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + \ - indent + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + \ - '} output_axi_t;\n' + newline += f'typedef {inp_axi_t} T_in;\n' + newline += f'typedef {out_axi_t} T_out;\n' + newline += ( + 'typedef struct in_struct {\n' + + indent + + 'T_in data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'in_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + + indent + + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + + '} input_axi_t;\n' + ) + newline += ( + 'typedef struct out_struct {\n' + + indent + + 'T_out data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'out_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + + indent + + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + + '} output_axi_t;\n' + ) else: - newline += 'typedef {} input_axi_t;\n'.format(inp_axi_t) - newline += 'typedef {} output_axi_t;\n'.format(out_axi_t) + newline += f'typedef {inp_axi_t} input_axi_t;\n' + newline += f'typedef {out_axi_t} output_axi_t;\n' else: 
newline = line fout.write(newline) @@ -73,21 +98,20 @@ def write_axi_wrapper(self, model): fout.close() ####################### - ## myproject_axi.cpp + # myproject_axi.cpp ####################### - f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.cpp'), 'r') - fout = open('{}/firmware/{}_axi.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), - 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') io_type = model.config.get_config_value("IOType") for line in f.readlines(): - if 'void myproject(' in line: - newline = 'void {}_axi(\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert include' in line: - newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert local vars' in line: + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}_axi.h"\n' + elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' if self.vivado_accelerator_config.get_interface() == 'axi_stream': newline += indent + 'bool is_last = false;\n' @@ -97,14 +121,15 @@ def write_axi_wrapper(self, model): elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' - newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'\ - .format(model.get_input_variables()[0].pragma[1]) - newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\ - .format(model.get_output_variables()[0].pragma[1]) - elif '//hls-fpga-machine-learning insert call' in line: - newline = indent + '{}(in_local, out_local);\n'.format( - model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert interface' in line: + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif '// hls-fpga-machine-learning insert call' in line: + newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' + elif '// hls-fpga-machine-learning insert interface' in line: if self.vivado_accelerator_config.get_interface() == 'axi_lite': newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' @@ -113,10 +138,12 @@ def write_axi_wrapper(self, model): elif self.vivado_accelerator_config.get_interface() == 'axi_master': newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' - newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'\ - .format(model.get_input_variables()[0].pragma[1]) - newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'\ - .format(model.get_output_variables()[0].pragma[1]) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format( + 
model.get_output_variables()[0].pragma[1] + ) elif self.vivado_accelerator_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' @@ -124,7 +151,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' if model.config.get_config_value("IOType") == 'io_stream': newline += indent + '#pragma HLS DATAFLOW\n' - elif '//hls-fpga-machine-learning insert enqueue' in line: + elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': newline = '' @@ -146,15 +173,27 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += indent + indent + indent + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' - newline += indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + ) + newline += ( + indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + ) else: - newline += indent + indent + indent + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + ) newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' newline = newline.format(input_t=inp.type.name) - elif '//hls-fpga-machine-learning insert dequeue' in line: + elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': newline = '' @@ -175,8 +214,15 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += indent + indent + indent + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' - newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + newline += ( + indent + + indent + + indent + + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? 
true : false;\n' + ) + newline += ( + indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' newline += indent + indent + '}}\n' @@ -193,18 +239,20 @@ def modify_build_script(self, model): Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function ''' filedir = os.path.dirname(os.path.abspath(__file__)) - oldfile = '{}/build_prj.tcl'.format(model.config.get_output_dir()) - newfile = '{}/build_prj_axi.tcl'.format(model.config.get_output_dir()) - f = open(oldfile, 'r') + oldfile = f'{model.config.get_output_dir()}/build_prj.tcl' + newfile = f'{model.config.get_output_dir()}/build_prj_axi.tcl' + f = open(oldfile) fout = open(newfile, 'w') for line in f.readlines(): if 'set_top' in line: newline = line[:-1] + '_axi\n' # remove the newline from the line end and append _axi for the new top - newline += 'add_files firmware/{}_axi.cpp -cflags "-std=c++0x"\n'.format( - model.config.get_project_name()) - elif '{}_cosim'.format(model.config.get_project_name()) in line: - newline = line.replace('{}_cosim'.format(model.config.get_project_name()), '{}_axi_cosim'.format(model.config.get_project_name())) + newline += f'add_files firmware/{model.config.get_project_name()}_axi.cpp -cflags "-std=c++0x"\n' + elif f'{model.config.get_project_name()}_cosim' in line: + newline = line.replace( + f'{model.config.get_project_name()}_cosim', + f'{model.config.get_project_name()}_axi_cosim', + ) elif '${project_name}.tcl' in line: newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl') else: @@ -219,8 +267,8 @@ def modify_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir, '../templates/vivado_accelerator/build_lib.sh'), 'r') - fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()), 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/build_lib.sh')) + fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) @@ -235,34 +283,37 @@ def write_wrapper_test(self, model): ################### # write myproject_test_wrapper.cpp ################### - oldfile = '{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) - newfile = '{}/{}_test_wrapper.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - f = open(oldfile, 'r') + f = open(oldfile) fout = open(newfile, 'w') inp = model.get_input_variables()[0] out = model.get_output_variables()[0] for line in f.readlines(): - if '{}.h'.format(model.config.get_project_name()) in line: - newline = line.replace('{}.h'.format(model.config.get_project_name()), - '{}_axi.h'.format(model.config.get_project_name())) + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: - newline = line.replace(inp.definition_cpp(), 'input_axi_t inputs[N_IN]') #TODO instead of replacing strings, how about we use proper variables and their definition? 
+ newline = line.replace( + inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + ) # TODO instead of replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') elif 'unsigned short' in line: newline = '' - elif '{}('.format(model.config.get_project_name()) in line: + elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) + newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, - 'input_axi_t') + newline = ( + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + ) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, - 'output_axi_t') + newline = ( + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + ) else: newline = line if self.vivado_accelerator_config.get_interface() == 'axi_stream': @@ -280,29 +331,27 @@ def write_wrapper_test(self, model): ################### # write myproject_bridge_wrapper.cpp ################### - oldfile = '{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) - newfile = '{}/{}_bridge_wrapper.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge_wrapper.cpp' - f = open(oldfile, 'r') + f = open(oldfile) fout = open(newfile, 'w') inp = model.get_input_variables()[0] out = model.get_output_variables()[0] for line in f.readlines(): - if '{}.h'.format(model.config.get_project_name()) in line: - newline = line.replace('{}.h'.format(model.config.get_project_name()), - '{}_axi.h'.format(model.config.get_project_name())) + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), - 'input_axi_t {}_ap[N_IN]'.format(inp.name)) + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), - 'output_axi_t {}_ap[N_OUT]'.format(out.name)) - elif '{}('.format(model.config.get_project_name()) in line: + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, - out.name) + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( + model.config.get_project_name(), inp.name, out.name + ) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 
'N_IN').replace(inp.type.name, 'input_axi_t') elif out.size_cpp() in line or out.name in line or out.type.name in line: @@ -320,57 +369,61 @@ def write_board_script(self, model): Write the tcl scripts and kernel sources to create a Vivado IPI project for the VivadoAccelerator ''' filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()), - '{}/design.tcl'.format(model.config.get_output_dir())) + copyfile( + os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()), + f'{model.config.get_output_dir()}/design.tcl', + ) # Generic alveo board if self.vivado_accelerator_config.get_board().startswith('alveo'): - src_dir=os.path.join(filedir, self.vivado_accelerator_config.get_krnl_rtl_src_dir()) - dst_dir= os.path.abspath(model.config.get_output_dir())+'/src' - copy_tree(src_dir,dst_dir) + src_dir = os.path.join(filedir, self.vivado_accelerator_config.get_krnl_rtl_src_dir()) + dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' + copy_tree(src_dir, dst_dir) ################### # project.tcl ################### - f = open('{}/project.tcl'.format(model.config.get_output_dir()), 'w') + f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') f.write('variable project_name\n') - f.write('set project_name "{}"\n'.format(model.config.get_project_name())) + f.write(f'set project_name "{model.config.get_project_name()}"\n') f.write('variable backend\n') f.write('set backend "vivadoaccelerator"\n') f.write('variable part\n') - f.write('set part "{}"\n'.format(self.vivado_accelerator_config.get_part())) + f.write(f'set part "{self.vivado_accelerator_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) f.write('variable clock_uncertainty\n') f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() - f.write('set bit_width_hls_output {}\n'.format(in_bit)) - f.write('set bit_width_hls_input {}\n'.format(out_bit)) + f.write(f'set bit_width_hls_output {in_bit}\n') + f.write(f'set bit_width_hls_input {out_bit}\n') f.close() def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), - ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir())) - + copyfile( + os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), + ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + ) + def write_new_tar(self, model): os.remove(model.config.get_output_dir() + '.tar.gz') - super(VivadoAcceleratorWriter, self).write_tar(model) + super().write_tar(model) - def write_hls(self, model): """ Write the HLS project. 
Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface """ - #TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package from hls4ml.backends import VivadoAcceleratorConfig - self.vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(), - model.get_output_variables()) - super(VivadoAcceleratorWriter, self).write_hls(model) + + self.vivado_accelerator_config = VivadoAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + super().write_hls(model) self.write_board_script(model) self.write_driver(model) self.write_wrapper_test(model) self.write_axi_wrapper(model) self.modify_build_script(model) self.write_new_tar(model) - diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index c70e28bb5f..a7d2691020 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -125,7 +125,7 @@ def write_project_cpp(self, model): # Add headers to weights and biases if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) @@ -137,7 +137,7 @@ def write_project_cpp(self, model): newline += ',\n' + brams_str newline += '\n' - elif '//hls-fpga-machine-learning insert load weights' in line: + elif '// hls-fpga-machine-learning insert load weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): @@ -155,7 +155,7 @@ def write_project_cpp(self, model): ) # Add input/output type - elif '//hls-fpga-machine-learning insert IO' in line: + elif '// hls-fpga-machine-learning insert IO' in line: newline = line all_inputs = [i.name for i in model_inputs] all_outputs = [o.name for o in model_outputs] @@ -184,7 +184,7 @@ def write_project_cpp(self, model): newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams)) newline += indent + '#pragma HLS DATAFLOW \n' - elif '//hls-fpga-machine-learning insert layers' in line: + elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' for layer in model.get_layers(): vars = layer.get_variables() @@ -243,9 +243,9 @@ def write_project_header(self, model): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) - elif 'void myproject(' in line: - newline = f'void {model.config.get_project_name()}(\n' - elif '//hls-fpga-machine-learning insert header' in line: + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) @@ -276,12 +276,12 @@ def write_defines(self, model): for line in f.readlines(): # Insert numbers - if '//hls-fpga-machine-learning insert 
numbers' in line: + if '// hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) newline += ''.join(numbers) - elif '//hls-fpga-machine-learning insert layer-precision' in line: + elif '// hls-fpga-machine-learning insert layer-precision' in line: newline = line all_precision = OrderedDict() for layer in model.get_layers(): @@ -312,19 +312,19 @@ def write_parameters(self, model): for line in f.readlines(): - if '//hls-fpga-machine-learning insert includes' in line: + if '// hls-fpga-machine-learning insert includes' in line: newline = line for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include - elif '//hls-fpga-machine-learning insert weights' in line: + elif '// hls-fpga-machine-learning insert weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): if w.storage.lower() != 'bram': newline += f'#include "weights/{w.name}.h"\n' - elif "//hls-fpga-machine-learning insert layer-config" in line: + elif "// hls-fpga-machine-learning insert layer-config" in line: newline = line for layer in model.get_layers(): config = layer.get_attr('config_cpp', None) @@ -415,11 +415,11 @@ def write_test_bench(self, model): # Insert numbers if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert bram' in line: + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line offset = 0 for inp in model_inputs: @@ -430,14 +430,14 @@ def write_test_bench(self, model): offset += inp.size() for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line for inp in model_inputs: newline += ' ' + inp.definition_cpp() + ';\n' newline += f' nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line input_vars = ','.join([i.name for i in model_inputs]) @@ -450,21 +450,22 @@ def write_test_bench(self, model): top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' newline += top_level - elif '//hls-fpga-machine-learning insert predictions' in line: + elif '// hls-fpga-machine-learning insert predictions' in line: newline = line for out in model_outputs: newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << pr[i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line for out in model_outputs: newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( out.type.name, out.size_cpp(), out.name ) # TODO enable this elif ( - '//hls-fpga-machine-learning insert output' in line or '//hls-fpga-machine-learning insert quantized' in line + '// hls-fpga-machine-learning insert 
output' in line + or '// hls-fpga-machine-learning insert quantized' in line ): newline = line for out in model_outputs: @@ -500,11 +501,11 @@ def write_bridge(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) - elif '//hls-fpga-machine-learning insert bram' in line: + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) @@ -512,7 +513,7 @@ def write_bridge(self, model): newline = '' newline += indent + inputs_str + ',\n' newline += indent + outputs_str + '\n' - elif '//hls-fpga-machine-learning insert wrapper' in line: + elif '// hls-fpga-machine-learning insert wrapper' in line: dtype = line.split('#', 1)[1].strip() newline = '' for i in model_inputs: @@ -543,7 +544,7 @@ def write_bridge(self, model): newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( o.type.name, dtype, o.size_cpp(), o.name, o.name ) - elif '//hls-fpga-machine-learning insert trace_outputs' in line: + elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp', None) @@ -661,7 +662,7 @@ def write_generated_code(self, model): f = open(path, 'w') for line in contents: - if '//hls4ml insert code' in line: + if '// hls4ml insert code' in line: newline = line for layer in model.get_layers(): for generated_code in layer.code.values(): diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index e7fa1ea15a..8645ecd0bc 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -4,7 +4,7 @@ import numpy as np import pytest from qkeras.qlayers import QActivation, QDense -from qkeras.quantizers import binary, quantized_bits, quantized_relu, ternary +from qkeras.quantizers import binary, quantized_bits, quantized_relu, quantized_sigmoid, quantized_tanh, ternary from qkeras.utils import _add_supported_quantized_objects from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split @@ -65,9 +65,6 @@ def convert(load_jettagging_model, strategy): Convert a QKeras model trained on the jet tagging dataset ''' model = load_jettagging_model - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=['Activation'], rounding_mode='AP_RND', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') config['Model']['Strategy'] = strategy @@ -79,7 +76,6 @@ def convert(load_jettagging_model, strategy): output_dir=str(test_root_path / f'hls4mlprj_qkeras_accuracy_{strategy}'), part='xcu250-figd2104-2L-e', ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() return hls_model @@ -149,15 +145,11 @@ def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_ty model.add(QActivation(activation=quantized_relu(bits, 0), name='relu1')) model.compile() - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - 
layers=['relu1'], rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') output_dir = str(test_root_path / f'hls4mlprj_qkeras_single_dense_activation_exact_{bits}_{alpha}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() y_qkeras = model.predict(X) @@ -185,6 +177,38 @@ def randX_100_10(): return randX(100, 10) +@pytest.mark.parametrize( + 'quantizer', [(quantized_tanh(8)), (quantized_sigmoid(5)), (quantized_sigmoid(7, use_real_sigmoid=True))] +) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_quantizer_special(randX_1000_1, quantizer, backend, io_type): + ''' + Test a single quantizer (tanh or sigmoid) as an Activation function. + Checks the type inference through the conversion is correct without just + using the same logic. + ''' + X = randX_1000_1 + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = Sequential() + model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) + model.compile() + + config = hls4ml.utils.config_from_keras_model(model, granularity='name') + output_dir = str( + test_root_path / f'hls4mlprj_qkeras_quantizer_{quantizer.__class__.__name__}_{quantizer.bits}_{backend}_{io_type}' + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + # Goal is to get it passing with all equal + np.testing.assert_allclose(y_qkeras, y_hls4ml, rtol=1e-2, atol=0.02) + + @pytest.mark.parametrize( 'test_no,N,kernel_quantizer,bias_quantizer,activation_quantizer,use_batchnorm,is_xnor', [ @@ -254,9 +278,6 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) model.compile() - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=['quantizer'], rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') output_dir = str( test_root_path @@ -267,7 +288,6 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() y_qkeras = model.predict(X) @@ -304,15 +324,11 @@ def test_qactivation_kwarg(randX_100_10, activation_quantizer, weight_quantizer) )(inputs) model = Model(inputs, outputs) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=[name], rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') out_dir = str(test_root_path / f'hls4mlprj_qactivation_kwarg_{activation_quantizer}') hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=out_dir) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() # Verify if activation 
in hls_model diff --git a/test/pytest/test_report.py b/test/pytest/test_report.py new file mode 100644 index 0000000000..b08709b2b1 --- /dev/null +++ b/test/pytest/test_report.py @@ -0,0 +1,71 @@ +import os +import shutil +from pathlib import Path + +import pytest +from tensorflow.keras.layers import Dense +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('backend', ['Vivado']) +def test_report(backend, capsys): + model = Sequential() + model.add(Dense(5, input_shape=(16,), name='fc1', activation='relu')) + + config = hls4ml.utils.config_from_keras_model(model, granularity='model') + + output_dir = str(test_root_path / f'hls4mlprj_report_{backend}') + + hls_model = hls4ml.converters.convert_from_keras_model( + model, io_type='io_stream', hls_config=config, output_dir=output_dir, part='xc7z020clg400-1', backend=backend + ) + hls_model.write() + + # to actually generate the reports (using Vivado 2020.1) + # hls_model.build(synth=True, vsynth=True) + + # copy pregenerated reports + os.makedirs(f'hls4mlprj_report_{backend}/myproject_prj/solution1/syn/report', exist_ok=True) + shutil.copy('test_report/vivado_hls.app', f'{output_dir}/myproject_prj/vivado_hls.app') + shutil.copy('test_report/myproject_csynth.rpt', f'{output_dir}/myproject_prj/solution1/syn/report/myproject_csynth.rpt') + shutil.copy('test_report/myproject_csynth.xml', f'{output_dir}/myproject_prj/solution1/syn/report/myproject_csynth.xml') + shutil.copy('test_report/vivado_synth.rpt', f'{output_dir}/vivado_synth.rpt') + + report = hls4ml.report.parse_vivado_report(output_dir) # or report = hls_model.build(...) + + capsys.readouterr() # capture to clear + hls4ml.report.print_vivado_report(report) + captured = capsys.readouterr() # capture again to test + + assert ( + captured.out + == '\n' + + '======================================================\n' + + '== C Synthesis report\n' + + '======================================================\n\n' + + ' - Performance estimates:\n' + + ' Best-case latency: 10 (50.0 ns)\n' + + ' Worst-case latency: 10 (50.0 ns)\n' + + ' Interval Min: 8\n' + + ' Interval Max: 8\n' + + ' Estimated Clock Period: 4.049\n\n' + + ' - Resource estimates:\n' + + ' BRAM_18K: 0 / 280 (0.0%)\n' + + ' DSP48E: 73 / 220 (33.2%)\n' + + ' FF: 7969 / 106400 (7.5%)\n' + + ' LUT: 2532 / 53200 (4.8%)\n' + + ' URAM: N/A\n\n' + + '======================================================\n' + + '== Vivado Synthesis report\n' + + '======================================================\n\n' + + ' - Resource utilization:\n' + + ' BRAM_18K: 0\n' + + ' DSP48E: 66\n' + + ' FF: 2428\n' + + ' LUT: 1526\n' + + ' URAM: N/A\n\n' + ) diff --git a/test/pytest/test_report/myproject_csynth.rpt b/test/pytest/test_report/myproject_csynth.rpt new file mode 100644 index 0000000000..8354501dbf --- /dev/null +++ b/test/pytest/test_report/myproject_csynth.rpt @@ -0,0 +1,196 @@ + + +================================================================ +== Vivado HLS Report for 'myproject' +================================================================ +* Date: Sat Mar 18 22:59:37 2023 + +* Version: 2020.1 (Build 2897737 on Wed May 27 20:21:37 MDT 2020) +* Project: myproject_prj +* Solution: solution1 +* Product family: zynq +* Target device: xc7z020-clg400-1 + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + 
+--------+---------+----------+------------+ + | Clock | Target | Estimated| Uncertainty| + +--------+---------+----------+------------+ + |ap_clk | 5.00 ns | 4.049 ns | 0.62 ns | + +--------+---------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+-----------+-----------+-----+-----+----------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline | + | min | max | min | max | min | max | Type | + +---------+---------+-----------+-----------+-----+-----+----------+ + | 10| 10| 50.000 ns | 50.000 ns | 8| 8| dataflow | + +---------+---------+-----------+-----------+-----+-----+----------+ + + + Detail: + * Instance: + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + | | | Latency (cycles) | Latency (absolute) | Interval | Pipeline | + | Instance | Module | min | max | min | max | min | max | Type | + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_U0 |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_s | 7| 7| 35.000 ns | 35.000 ns | 7| 7| none | + |relu_array_array_ap_fixed_5u_relu_config3_U0 |relu_array_array_ap_fixed_5u_relu_config3_s | 2| 2| 10.000 ns | 10.000 ns | 1| 1| function | + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + + * Loop: + N/A + + + +================================================================ +== Utilization Estimates +================================================================ +* Summary: ++-----------------+---------+-------+--------+-------+-----+ +| Name | BRAM_18K| DSP48E| FF | LUT | URAM| ++-----------------+---------+-------+--------+-------+-----+ +|DSP | -| -| -| -| -| +|Expression | -| -| 0| 2| -| +|FIFO | 0| -| 25| 140| -| +|Instance | 0| 73| 7944| 2390| -| +|Memory | -| -| -| -| -| +|Multiplexer | -| -| -| -| -| +|Register | -| -| -| -| -| ++-----------------+---------+-------+--------+-------+-----+ +|Total | 0| 73| 7969| 2532| 0| ++-----------------+---------+-------+--------+-------+-----+ +|Available | 280| 220| 106400| 53200| 0| ++-----------------+---------+-------+--------+-------+-----+ +|Utilization (%) | 0| 33| 7| 4| 0| ++-----------------+---------+-------+--------+-------+-----+ + ++ Detail: + * Instance: + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + | Instance | Module | BRAM_18K| DSP48E| FF | LUT | URAM| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_U0 |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_s | 0| 73| 7860| 2134| 0| + |relu_array_array_ap_fixed_5u_relu_config3_U0 |relu_array_array_ap_fixed_5u_relu_config3_s | 0| 0| 84| 256| 0| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + |Total | | 0| 73| 7944| 2390| 0| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + + * DSP48E: + N/A + + * Memory: + N/A + + * 
FIFO: + +-------------------------+---------+---+----+-----+------+-----+---------+ + | Name | BRAM_18K| FF| LUT| URAM| Depth| Bits| Size:D*B| + +-------------------------+---------+---+----+-----+------+-----+---------+ + |layer2_out_V_data_0_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_1_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_2_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_3_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_4_V_U | 0| 5| 0| -| 1| 16| 16| + +-------------------------+---------+---+----+-----+------+-----+---------+ + |Total | 0| 25| 0| 0| 5| 80| 80| + +-------------------------+---------+---+----+-----+------+-----+---------+ + + * Expression: + +--------------+----------+-------+---+----+------------+------------+ + | Variable Name| Operation| DSP48E| FF| LUT| Bitwidth P0| Bitwidth P1| + +--------------+----------+-------+---+----+------------+------------+ + |ap_idle | and | 0| 0| 2| 1| 1| + +--------------+----------+-------+---+----+------------+------------+ + |Total | | 0| 0| 2| 1| 1| + +--------------+----------+-------+---+----+------------+------------+ + + * Multiplexer: + N/A + + * Register: + N/A + + + +================================================================ +== Interface +================================================================ +* Summary: ++------------------------------+-----+-----+------------+-----------------------+--------------+ +| RTL Ports | Dir | Bits| Protocol | Source Object | C Type | ++------------------------------+-----+-----+------------+-----------------------+--------------+ +|fc1_input_V_data_0_V_TDATA | in | 16| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_0_V_TVALID | in | 1| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_0_V_TREADY | out | 1| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_1_V_TDATA | in | 16| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_1_V_TVALID | in | 1| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_1_V_TREADY | out | 1| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_2_V_TDATA | in | 16| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_2_V_TVALID | in | 1| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_2_V_TREADY | out | 1| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_3_V_TDATA | in | 16| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_3_V_TVALID | in | 1| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_3_V_TREADY | out | 1| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_4_V_TDATA | in | 16| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_4_V_TVALID | in | 1| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_4_V_TREADY | out | 1| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_5_V_TDATA | in | 16| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_5_V_TVALID | in | 1| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_5_V_TREADY | out | 1| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_6_V_TDATA | in | 16| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_6_V_TVALID | in | 1| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_6_V_TREADY | out | 1| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_7_V_TDATA | in | 16| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_7_V_TVALID | in | 1| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_7_V_TREADY | out | 1| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_8_V_TDATA | in | 16| 
axis | fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_8_V_TVALID | in | 1| axis | fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_8_V_TREADY | out | 1| axis | fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_9_V_TDATA | in | 16| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_9_V_TVALID | in | 1| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_9_V_TREADY | out | 1| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_10_V_TDATA | in | 16| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_10_V_TVALID | in | 1| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_10_V_TREADY | out | 1| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_11_V_TDATA | in | 16| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_11_V_TVALID | in | 1| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_11_V_TREADY | out | 1| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_12_V_TDATA | in | 16| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_12_V_TVALID | in | 1| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_12_V_TREADY | out | 1| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_13_V_TDATA | in | 16| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_13_V_TVALID | in | 1| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_13_V_TREADY | out | 1| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_14_V_TDATA | in | 16| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_14_V_TVALID | in | 1| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_14_V_TREADY | out | 1| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_15_V_TDATA | in | 16| axis | fc1_input_V_data_15_V | pointer | +|fc1_input_V_data_15_V_TVALID | in | 1| axis | fc1_input_V_data_15_V | pointer | +|fc1_input_V_data_15_V_TREADY | out | 1| axis | fc1_input_V_data_15_V | pointer | +|layer3_out_V_data_0_V_TDATA | out | 16| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_0_V_TVALID | out | 1| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_0_V_TREADY | in | 1| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_1_V_TDATA | out | 16| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_1_V_TVALID | out | 1| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_1_V_TREADY | in | 1| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_2_V_TDATA | out | 16| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_2_V_TVALID | out | 1| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_2_V_TREADY | in | 1| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_3_V_TDATA | out | 16| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_3_V_TVALID | out | 1| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_3_V_TREADY | in | 1| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_4_V_TDATA | out | 16| axis | layer3_out_V_data_4_V | pointer | +|layer3_out_V_data_4_V_TVALID | out | 1| axis | layer3_out_V_data_4_V | pointer | +|layer3_out_V_data_4_V_TREADY | in | 1| axis | layer3_out_V_data_4_V | pointer | +|ap_clk | in | 1| ap_ctrl_hs | myproject | return value | +|ap_rst_n | in | 1| ap_ctrl_hs | myproject | return value | +|ap_start | in | 1| ap_ctrl_hs | myproject | return value | +|ap_done | out | 1| ap_ctrl_hs | myproject | return value | +|ap_ready | out | 1| ap_ctrl_hs | myproject | return value | +|ap_idle | out | 1| ap_ctrl_hs | myproject | return value | 
++------------------------------+-----+-----+------------+-----------------------+--------------+ + diff --git a/test/pytest/test_report/myproject_csynth.xml b/test/pytest/test_report/myproject_csynth.xml new file mode 100644 index 0000000000..711a5ec12b --- /dev/null +++ b/test/pytest/test_report/myproject_csynth.xml @@ -0,0 +1,878 @@ + + + +2020.1 + + + +ns +zynq +xc7z020-clg400-1 +myproject +5.00 +0.62 + + + +dataflow + +ns +4.049 + + +clock cycles +10 +10 +10 +50.000 ns +50.000 ns +50.000 ns +8 +8 +8 + + + + + +0 +73 +7969 +2532 +0 + + +280 +220 +106400 +53200 +0 + + + + + +fc1_input_V_data_0_V_TDATA +fc1_input_V_data_0_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_0_V_TVALID +fc1_input_V_data_0_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_0_V_TREADY +fc1_input_V_data_0_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_1_V_TDATA +fc1_input_V_data_1_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_1_V_TVALID +fc1_input_V_data_1_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_1_V_TREADY +fc1_input_V_data_1_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_2_V_TDATA +fc1_input_V_data_2_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_2_V_TVALID +fc1_input_V_data_2_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_2_V_TREADY +fc1_input_V_data_2_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_3_V_TDATA +fc1_input_V_data_3_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_3_V_TVALID +fc1_input_V_data_3_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_3_V_TREADY +fc1_input_V_data_3_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_4_V_TDATA +fc1_input_V_data_4_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_4_V_TVALID +fc1_input_V_data_4_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_4_V_TREADY +fc1_input_V_data_4_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_5_V_TDATA +fc1_input_V_data_5_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_5_V_TVALID +fc1_input_V_data_5_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_5_V_TREADY +fc1_input_V_data_5_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_6_V_TDATA +fc1_input_V_data_6_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_6_V_TVALID +fc1_input_V_data_6_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_6_V_TREADY +fc1_input_V_data_6_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_7_V_TDATA +fc1_input_V_data_7_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_7_V_TVALID +fc1_input_V_data_7_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_7_V_TREADY +fc1_input_V_data_7_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_8_V_TDATA +fc1_input_V_data_8_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_8_V_TVALID +fc1_input_V_data_8_V +pointer + +axis +register, both mode +in +1 +control +int + + 
+fc1_input_V_data_8_V_TREADY +fc1_input_V_data_8_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_9_V_TDATA +fc1_input_V_data_9_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_9_V_TVALID +fc1_input_V_data_9_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_9_V_TREADY +fc1_input_V_data_9_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_10_V_TDATA +fc1_input_V_data_10_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_10_V_TVALID +fc1_input_V_data_10_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_10_V_TREADY +fc1_input_V_data_10_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_11_V_TDATA +fc1_input_V_data_11_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_11_V_TVALID +fc1_input_V_data_11_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_11_V_TREADY +fc1_input_V_data_11_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_12_V_TDATA +fc1_input_V_data_12_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_12_V_TVALID +fc1_input_V_data_12_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_12_V_TREADY +fc1_input_V_data_12_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_13_V_TDATA +fc1_input_V_data_13_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_13_V_TVALID +fc1_input_V_data_13_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_13_V_TREADY +fc1_input_V_data_13_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_14_V_TDATA +fc1_input_V_data_14_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_14_V_TVALID +fc1_input_V_data_14_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_14_V_TREADY +fc1_input_V_data_14_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_15_V_TDATA +fc1_input_V_data_15_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_15_V_TVALID +fc1_input_V_data_15_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_15_V_TREADY +fc1_input_V_data_15_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_0_V_TDATA +layer3_out_V_data_0_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_0_V_TVALID +layer3_out_V_data_0_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_0_V_TREADY +layer3_out_V_data_0_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_1_V_TDATA +layer3_out_V_data_1_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_1_V_TVALID +layer3_out_V_data_1_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_1_V_TREADY +layer3_out_V_data_1_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_2_V_TDATA +layer3_out_V_data_2_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_2_V_TVALID +layer3_out_V_data_2_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_2_V_TREADY +layer3_out_V_data_2_V +pointer + +axis +register, both mode +in +1 
+control +int + + +layer3_out_V_data_3_V_TDATA +layer3_out_V_data_3_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_3_V_TVALID +layer3_out_V_data_3_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_3_V_TREADY +layer3_out_V_data_3_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_4_V_TDATA +layer3_out_V_data_4_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_4_V_TVALID +layer3_out_V_data_4_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_4_V_TREADY +layer3_out_V_data_4_V +pointer + +axis +register, both mode +in +1 +control +int + + +ap_clk +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_rst_n +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_start +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_done +myproject +return value + +ap_ctrl_hs + +out +1 +control + + +ap_ready +myproject +return value + +ap_ctrl_hs + +out +1 +control + + +ap_idle +myproject +return value + +ap_ctrl_hs + +out +1 +control + + + + diff --git a/test/pytest/test_report/vivado_hls.app b/test/pytest/test_report/vivado_hls.app new file mode 100644 index 0000000000..c57b8a471d --- /dev/null +++ b/test/pytest/test_report/vivado_hls.app @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/test/pytest/test_report/vivado_synth.rpt b/test/pytest/test_report/vivado_synth.rpt new file mode 100644 index 0000000000..971f3e5496 --- /dev/null +++ b/test/pytest/test_report/vivado_synth.rpt @@ -0,0 +1,184 @@ +Copyright 1986-2020 Xilinx, Inc. All Rights Reserved. +------------------------------------------------------------------------------------ +| Tool Version : Vivado v.2020.1 (lin64) Build 2902540 Wed May 27 19:54:35 MDT 2020 +| Date : Sun Mar 19 07:48:36 2023 +| Host : mulder.t2.ucsd.edu running 64-bit unknown +| Command : report_utilization -file vivado_synth.rpt +| Design : myproject +| Device : 7z020clg400-1 +| Design State : Synthesized +------------------------------------------------------------------------------------ + +Utilization Design Information + +Table of Contents +----------------- +1. Slice Logic +1.1 Summary of Registers by Type +2. Memory +3. DSP +4. IO and GT Specific +5. Clocking +6. Specific Feature +7. Primitives +8. Black Boxes +9. Instantiated Netlists + +1. Slice Logic +-------------- + ++----------------------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------------------+------+-------+-----------+-------+ +| Slice LUTs* | 1526 | 0 | 53200 | 2.87 | +| LUT as Logic | 1478 | 0 | 53200 | 2.78 | +| LUT as Memory | 48 | 0 | 17400 | 0.28 | +| LUT as Distributed RAM | 0 | 0 | | | +| LUT as Shift Register | 48 | 0 | | | +| Slice Registers | 2428 | 0 | 106400 | 2.28 | +| Register as Flip Flop | 2428 | 0 | 106400 | 2.28 | +| Register as Latch | 0 | 0 | 106400 | 0.00 | +| F7 Muxes | 0 | 0 | 26600 | 0.00 | +| F8 Muxes | 0 | 0 | 13300 | 0.00 | ++----------------------------+------+-------+-----------+-------+ +* Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
+ + +1.1 Summary of Registers by Type +-------------------------------- + ++-------+--------------+-------------+--------------+ +| Total | Clock Enable | Synchronous | Asynchronous | ++-------+--------------+-------------+--------------+ +| 0 | _ | - | - | +| 0 | _ | - | Set | +| 0 | _ | - | Reset | +| 0 | _ | Set | - | +| 0 | _ | Reset | - | +| 0 | Yes | - | - | +| 0 | Yes | - | Set | +| 0 | Yes | - | Reset | +| 18 | Yes | Set | - | +| 2410 | Yes | Reset | - | ++-------+--------------+-------------+--------------+ + + +2. Memory +--------- + ++----------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------+------+-------+-----------+-------+ +| Block RAM Tile | 0 | 0 | 140 | 0.00 | +| RAMB36/FIFO* | 0 | 0 | 140 | 0.00 | +| RAMB18 | 0 | 0 | 280 | 0.00 | ++----------------+------+-------+-----------+-------+ +* Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 + + +3. DSP +------ + ++----------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------+------+-------+-----------+-------+ +| DSPs | 66 | 0 | 220 | 30.00 | +| DSP48E1 only | 66 | | | | ++----------------+------+-------+-----------+-------+ + + +4. IO and GT Specific +--------------------- + ++-----------------------------+------+-------+-----------+--------+ +| Site Type | Used | Fixed | Available | Util% | ++-----------------------------+------+-------+-----------+--------+ +| Bonded IOB | 384 | 0 | 125 | 307.20 | +| Bonded IPADs | 0 | 0 | 2 | 0.00 | +| Bonded IOPADs | 0 | 0 | 130 | 0.00 | +| PHY_CONTROL | 0 | 0 | 4 | 0.00 | +| PHASER_REF | 0 | 0 | 4 | 0.00 | +| OUT_FIFO | 0 | 0 | 16 | 0.00 | +| IN_FIFO | 0 | 0 | 16 | 0.00 | +| IDELAYCTRL | 0 | 0 | 4 | 0.00 | +| IBUFDS | 0 | 0 | 121 | 0.00 | +| PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 16 | 0.00 | +| PHASER_IN/PHASER_IN_PHY | 0 | 0 | 16 | 0.00 | +| IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 200 | 0.00 | +| ILOGIC | 0 | 0 | 125 | 0.00 | +| OLOGIC | 0 | 0 | 125 | 0.00 | ++-----------------------------+------+-------+-----------+--------+ + + +5. Clocking +----------- + ++------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++------------+------+-------+-----------+-------+ +| BUFGCTRL | 1 | 0 | 32 | 3.13 | +| BUFIO | 0 | 0 | 16 | 0.00 | +| MMCME2_ADV | 0 | 0 | 4 | 0.00 | +| PLLE2_ADV | 0 | 0 | 4 | 0.00 | +| BUFMRCE | 0 | 0 | 8 | 0.00 | +| BUFHCE | 0 | 0 | 72 | 0.00 | +| BUFR | 0 | 0 | 16 | 0.00 | ++------------+------+-------+-----------+-------+ + + +6. Specific Feature +------------------- + ++-------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++-------------+------+-------+-----------+-------+ +| BSCANE2 | 0 | 0 | 4 | 0.00 | +| CAPTUREE2 | 0 | 0 | 1 | 0.00 | +| DNA_PORT | 0 | 0 | 1 | 0.00 | +| EFUSE_USR | 0 | 0 | 1 | 0.00 | +| FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | +| ICAPE2 | 0 | 0 | 2 | 0.00 | +| STARTUPE2 | 0 | 0 | 1 | 0.00 | +| XADC | 0 | 0 | 1 | 0.00 | ++-------------+------+-------+-----------+-------+ + + +7. 
Primitives +------------- + ++----------+------+---------------------+ +| Ref Name | Used | Functional Category | ++----------+------+---------------------+ +| FDRE | 2410 | Flop & Latch | +| LUT2 | 864 | LUT | +| LUT3 | 671 | LUT | +| LUT4 | 499 | LUT | +| CARRY4 | 295 | CarryLogic | +| IBUF | 280 | IO | +| OBUF | 104 | IO | +| DSP48E1 | 66 | Block Arithmetic | +| LUT1 | 63 | LUT | +| SRL16E | 48 | Distributed Memory | +| LUT5 | 43 | LUT | +| LUT6 | 34 | LUT | +| FDSE | 18 | Flop & Latch | +| BUFG | 1 | Clock | ++----------+------+---------------------+ + + +8. Black Boxes +-------------- + ++----------+------+ +| Ref Name | Used | ++----------+------+ + + +9. Instantiated Netlists +------------------------ + ++----------+------+ +| Ref Name | Used | ++----------+------+ + +
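A minimal usage sketch of the report helpers that the new test_report.py exercises against the fixture files above. The directory name is the one the test constructs for the Vivado backend and is assumed to already contain the reports, either generated by hls_model.build(synth=True, vsynth=True) or copied in as the pregenerated Vivado 2020.1 fixtures; this is an illustration of the intended call sequence, not additional test code.

import hls4ml

# Parse the C synthesis report (myproject_csynth.rpt/.xml under
# myproject_prj/solution1/syn/report) and the Vivado synthesis report
# (vivado_synth.rpt) found in the project output directory. The directory
# below is assumed to exist with those files in place.
report = hls4ml.report.parse_vivado_report('hls4mlprj_report_Vivado')

# Pretty-print the parsed performance and resource estimates; this produces
# the summary text that the test asserts on via capsys.
hls4ml.report.print_vivado_report(report)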