From cf442238f27742d5dff88fc11fbdc2a6bab4567c Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 20:44:42 +0200 Subject: [PATCH 01/20] Remove the limit function, Vitis doesn't like this --- .../backends/vivado/passes/convolution_templates.py | 7 +++++++ hls4ml/backends/vivado/passes/core_templates.py | 2 ++ hls4ml/backends/vivado/passes/merge_templates.py | 1 + hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h | 11 ++++------- .../vivado/nnet_utils/nnet_batchnorm_stream.h | 5 ++--- .../templates/vivado/nnet_utils/nnet_dense_latency.h | 3 +-- hls4ml/templates/vivado/nnet_utils/nnet_merge.h | 3 +-- hls4ml/templates/vivado/nnet_utils/nnet_mult.h | 9 +-------- .../templates/vivado/nnet_utils/nnet_sepconv_stream.h | 3 +-- 9 files changed, 20 insertions(+), 24 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index d4ac2d5b0..a16ab80c6 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -10,6 +10,8 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; @@ -103,6 +105,7 @@ def format(self, node): static const unsigned out_width = {out_width}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; @@ -278,6 +281,10 @@ def format(self, node): # Depthwise config params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index f63c0f454..8327e3a7f 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -13,6 +13,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; @@ -60,6 +61,7 @@ def format(self, node): static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); static const bool store_weights_in_bram = false; typedef {bias_t.name} bias_t; typedef {scale_t.name} scale_t; diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 863512c4c..7aa705750 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -49,6 +49,7 @@ def format(self, node): static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); typedef {accum_t.name} accum_t; template using product = nnet::product::{product_type}; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index 200282784..2314f5609 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -70,18 +70,17 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Calcuate result Result: for (int ires = 0; ires < CONFIG_T::n_in; ires++) { if (CONFIG_T::n_filt==-1) { res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + bias[ires]; - } else { + } else { int norm_index = ires%CONFIG_T::n_filt; res[ires] = CONFIG_T::template product::product(data[ires], scale[norm_index]) + bias[norm_index]; } - } + } } // **************************************************** @@ -108,13 +107,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T data_T datareg; ap_uint<1> cache; for (int ii=0; ii threshold[norm_index] ) cache = 1; else cache = 0; res[ii] = (ap_uint<1>) cache; - } } @@ -134,7 +132,6 @@ void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T else cache = 0; res[ii] = (ap_int<2>) cache; - } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index ce76c01bc..a2b406806 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -41,9 +41,8 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; - CONFIG_T::template product::limit(multiplier_limit); + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index c4dcea4ab..464e8b495 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -52,8 +52,7 @@ void dense_latency( #pragma 
HLS ARRAY_PARTITION variable=mult complete #pragma HLS ARRAY_PARTITION variable=acc complete - int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index a35c264d2..19f2b421d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -140,8 +140,7 @@ void dot1d( { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=mult complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index 586bc65ae..966959c70 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -16,10 +16,7 @@ namespace product{ * types of each. * --- */ -class Product{ - public: - static void limit(unsigned multiplier_limit) {} // Nothing to do here -}; +class Product{}; template class both_binary : public Product{ @@ -77,10 +74,6 @@ class mult : public Product{ #pragma HLS INLINE return a * w; } - static void limit(unsigned multiplier_limit){ - #pragma HLS INLINE - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - } }; template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index e8826e300..5788d429e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -26,8 +26,7 @@ void depthwise_product( #pragma HLS ARRAY_PARTITION variable=mult complete - int multiplier_limit = ceil(float(CONFIG_T::kernel_size * CONFIG_T::n_chan) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product: for(int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { From baacb45e04281f5d123dc35d34b3fceebf75d8b4 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 20:45:15 +0200 Subject: [PATCH 02/20] Rudimentary Vitis backend --- hls4ml/backends/__init__.py | 2 ++ hls4ml/backends/vitis/vitis_backend.py | 32 ++++++++++++++++++++++++++ hls4ml/report/vivado_report.py | 24 +++++++++++-------- hls4ml/writer/__init__.py | 2 ++ hls4ml/writer/vitis_writer.py | 15 ++++++++++++ 5 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 hls4ml/backends/vitis/vitis_backend.py create mode 100644 hls4ml/writer/vitis_writer.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd44d466..5fe692052 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,8 +5,10 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from 
hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig +from hls4ml.backends.vitis.vitis_backend import VitisBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) +register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py new file mode 100644 index 000000000..512917e93 --- /dev/null +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -0,0 +1,32 @@ +import os +import sys + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow +from hls4ml.report import parse_vivado_report + + +class VitisBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='Vitis') + self._register_flows() + + def _register_flows(self): + vivado_ip = 'vivado:ip' + writer_passes = ['make_stamp', 'vitis:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) + self._default_flow = vivado_ip + + def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + if 'linux' in sys.platform: + found = os.system('command -v vitis_hls > /dev/null') + if found != 0: + raise Exception('Vitis HLS installation not found. Make sure "vitis_hls" is on PATH.') + + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + os.system('vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation} export={export} vsynth={vsynth}"' + .format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)) + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) \ No newline at end of file diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 7930564b8..4325afd29 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -53,15 +53,21 @@ def _find_solutions(sln_dir): solutions = [] if os.path.isfile(sln_dir + '/vivado_hls.app'): - with open(sln_dir + '/vivado_hls.app') as f: - # Get rid of namespaces (workaround to support two types of vivado_hls.app files) - xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) - - root = ET.fromstring(xmlstring) - for sln_tag in root.findall('solutions/solution'): - sln_name = sln_tag.get('name') - if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): - solutions.append(sln_name) + sln_file = 'vivado_hls.app' + elif os.path.isfile(sln_dir + '/hls.app'): + sln_file = 'hls.app' + else: + return solutions + + with open(sln_dir + '/' + sln_file) as f: + # Get rid of namespaces (workaround to support two types of vivado_hls.app files) + xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) + + root = ET.fromstring(xmlstring) + for sln_tag in root.findall('solutions/solution'): + sln_name = sln_tag.get('name') + if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): + solutions.append(sln_name) return solutions diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b25576f9a..8ac4f1f8e 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -3,8 +3,10 @@ from hls4ml.writer.writers import Writer, register_writer, get_writer 
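(Illustrative usage sketch, not part of the diffs: it assumes the standard hls4ml Keras conversion API and an existing Keras model object named `model`; the output directory name is arbitrary and the part string is simply the default from build_prj.tcl. With the backend and writer registered as above, Vitis would be selected like any other backend.)

    import hls4ml

    # Assumed: `model` is an existing Keras model; granularity/part follow common hls4ml defaults.
    config = hls4ml.utils.config_from_keras_model(model, granularity='model')

    hls_model = hls4ml.converters.convert_from_keras_model(
        model,
        hls_config=config,
        backend='Vitis',               # resolved via register_backend('Vitis', VitisBackend)
        output_dir='my-vitis-prj',     # arbitrary example path
        part='xcku115-flvb2104-2-i',   # same default part as build_prj.tcl
    )

    # Dispatches to VitisBackend.build(), which runs `vitis_hls -f build_prj.tcl ...`
    # and returns the parsed report from parse_vivado_report().
    report = hls_model.build(csim=False, synth=True)

The keyword arguments of `hls_model.build()` map onto the `reset/csim/synth/cosim/validation/export/vsynth` options that `VitisBackend.build()` forwards to the `vitis_hls` command line.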
from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.quartus_writer import QuartusWriter register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) +register_writer('Vitis', VitisWriter) register_writer('Quartus', QuartusWriter) diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py new file mode 100644 index 000000000..45784acb6 --- /dev/null +++ b/hls4ml/writer/vitis_writer.py @@ -0,0 +1,15 @@ +import os +from shutil import copyfile, copytree +from distutils.dir_util import copy_tree +from hls4ml.writer.vivado_writer import VivadoWriter + +class VitisWriter(VivadoWriter): + + def __init__(self): + super().__init__() + + def write_hls(self, model): + """ + Write the HLS project. Calls the steps from VivadoWriter, adapted for Vitis + """ + super(VitisWriter, self).write_hls(model) From be460ce88c759421c12e3414b02bc7de3ea8a30c Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Jul 2022 21:17:44 +0200 Subject: [PATCH 03/20] Ensure default build options are the same --- hls4ml/templates/vivado/build_prj.tcl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index df01e459a..cba53de03 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -164,9 +164,11 @@ if {$opt(reset)} { open_solution "solution1" } catch {config_array_partition -maximum_size 4096} -config_compile -name_max_length 60 +config_compile -name_max_length 80 set_part {xcku115-flvb2104-2-i} +config_schedule -enable_dsp_full_reg=false create_clock -period 5 -name default +set_clock_uncertainty 12.5% default if {$opt(csim)} { From c7c10e3896b0553699672935a09d55859791df99 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 15:40:20 +0200 Subject: [PATCH 04/20] Conditionally use DATA_PACK pragma --- .../nnet_utils/nnet_activation_stream.h | 34 ++++++------ .../vivado/nnet_utils/nnet_batchnorm_stream.h | 6 +-- .../templates/vivado/nnet_utils/nnet_common.h | 11 ++++ .../vivado/nnet_utils/nnet_conv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_conv2d_stream.h | 2 +- .../vivado/nnet_utils/nnet_conv_stream.h | 4 +- .../vivado/nnet_utils/nnet_dense_stream.h | 2 +- .../vivado/nnet_utils/nnet_embed_stream.h | 2 +- .../vivado/nnet_utils/nnet_image_stream.h | 2 +- .../vivado/nnet_utils/nnet_merge_stream.h | 30 +++++------ .../vivado/nnet_utils/nnet_pooling_stream.h | 16 +++--- .../vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../templates/vivado/nnet_utils/nnet_stream.h | 53 ++++++++++--------- 14 files changed, 90 insertions(+), 78 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 8f294daee..bc36c1a5c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -40,7 +40,7 @@ void linear(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LinearPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -62,7 +62,7 @@ void relu(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS 
DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -98,7 +98,7 @@ void sigmoid(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -164,7 +164,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -238,7 +238,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -300,7 +300,7 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -356,7 +356,7 @@ void tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) TanHPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -386,7 +386,7 @@ void hard_sigmoid(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -412,7 +412,7 @@ void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hl data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -435,7 +435,7 @@ void thresholded_relu(hls::stream &data, typename data_T::value_type the data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -472,7 +472,7 @@ void softplus(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -511,7 +511,7 @@ void softsign(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -549,7 +549,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) EluPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -596,7 +596,7 @@ void selu(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK 
variable=out_data + PRAGMA_DATA_PACK(out_data) SeluPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -626,7 +626,7 @@ void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -647,7 +647,7 @@ void binary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -668,7 +668,7 @@ void ternary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index a2b406806..ce49d65b0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -49,7 +49,7 @@ void normalize( data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormpack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -82,7 +82,7 @@ void normalize_binary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -108,7 +108,7 @@ void normalize_ternary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 9bfae8339..af59f9021 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -27,6 +27,17 @@ #define MIN(n,d) (n > d ? d : n) #define MAX(n,d) (n > d ? 
n : d) +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable= +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + namespace nnet { // Common type definitions diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index e0f6f0833..e887b2564 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -40,7 +40,7 @@ void conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 5d1c7d1ef..7d451b3ba 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -44,7 +44,7 @@ void conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 862e8361a..a922d5854 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -263,7 +263,7 @@ void compute_output_buffer_2d( #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -333,7 +333,7 @@ void compute_output_buffer_1d( #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::kernel_shift_1d(in_elem, kernel_data); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 52c96c52c..9b0deb9b4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -56,7 +56,7 @@ void dense( #pragma HLS PIPELINE } res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = res[i_out * res_T::size + i_pack]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h index 3ada00b24..fb8e2fb43 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h @@ -19,7 +19,7 @@ void embedding( #pragma HLS PIPELINE II=CONFIG_T::reuse_factor res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) DenseEmbedding: for (int i = 0; i < CONFIG_T::n_out; i++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h index 
42d2ce80e..89f91d6f0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h @@ -44,7 +44,7 @@ void resize_nearest( #pragma HLS UNROLL data_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ResizeChan: for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index 6b053b878..c9ac45edf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -40,7 +40,7 @@ void add( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AddPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -65,7 +65,7 @@ void subtract( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SubtractPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +90,7 @@ void multiply( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MultiplyPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -115,7 +115,7 @@ void average( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AveragePack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -140,7 +140,7 @@ void maximum( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MaximumPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -165,7 +165,7 @@ void minimum( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MinimumPack: for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -188,7 +188,7 @@ void concatenate3d_0( input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -204,7 +204,7 @@ void concatenate3d_0( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -228,7 +228,7 @@ void concatenate3d_1( input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -242,7 +242,7 @@ void concatenate3d_1( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -267,7 +267,7 @@ void concatenate3d_2( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -310,7 +310,7 @@ void concatenate2d_0( input1_T in_data1 = data1.read(); res_T out_data; 
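(Aside on the macro used throughout these hunks: `PRAGMA_DATA_PACK(out_data)` is defined in nnet_common.h so that under Vivado HLS it expands via `EXPAND_STRING`/`_Pragma` to the familiar `#pragma HLS DATA_PACK variable=out_data`, while under Vitis HLS, where `__VITIS_HLS__` is defined, it expands to nothing; Vitis HLS dropped the DATA_PACK pragma, its successor being AGGREGATE, which a later patch in this series applies to the compressed-dense weights.)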
- #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -324,7 +324,7 @@ void concatenate2d_0( input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { #pragma HLS UNROLL @@ -347,7 +347,7 @@ void concatenate2d_1( input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { #pragma HLS UNROLL @@ -383,7 +383,7 @@ void concatenate1d( hls::stream &res) { res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatLoop1: for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { #pragma HLS PIPELINE input1_T in_data1 = data1.read(); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 80fa1d287..4113d9e36 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -130,7 +130,7 @@ void pooling2d_encoded_cl( assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -176,7 +176,7 @@ void compute_pool_buffer_2d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -344,7 +344,7 @@ void pooling1d_encoded_cl( assert(CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -385,7 +385,7 @@ void compute_pool_buffer_1d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer @@ -523,7 +523,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -535,7 +535,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); @@ -577,7 +577,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -589,7 +589,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index b0f6ce9c6..71ccf1a01 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -26,7 +26,7 @@ void depthwise_conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index 69e272652..b2c80950a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -27,7 +27,7 @@ void depthwise_conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h index 9ee6628fe..b4de14ffd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h @@ -3,17 +3,18 @@ #define NNET_STREAM_H #include "hls_stream.h" +#include "nnet_common.h" namespace nnet { struct broadcast_config { - static const unsigned in_height = 1; - static const unsigned in_width = 1; - static const unsigned in_chan = 3; - static const unsigned out_height = 2; - static const unsigned out_width = 2; - static const unsigned out_chan = 3; + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; }; template @@ -24,8 +25,8 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre data_T in_data = data.read(); res_T out_data1; res_T out_data2; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 + PRAGMA_DATA_PACK(out_data1) + 
PRAGMA_DATA_PACK(out_data2) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -47,9 +48,9 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre res_T out_data1; res_T out_data2; res_T out_data3; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 - #pragma HLS DATA_PACK variable=out_data3 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -72,7 +73,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +91,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < pack_diff; j++) { #pragma HLS PIPELINE @@ -136,7 +137,7 @@ void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) for (int j = 0; j < n_dupl; j++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL out_data[k] = in_data[k]; @@ -152,20 +153,20 @@ void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) BroadcastLoop: for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - res_T out_data; - #pragma HLS DATA_PACK variable=out_data - for (int k = 0; k < res_T::size; k++) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL - out_data[k] = in_data[0]; - } - res.write(out_data); + out_data[k] = in_data[0]; + } + res.write(out_data); } } template void broadcast_stream(hls::stream &data, hls::stream &res) { if(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { - broadcast_stream_1x1xC(data, res); + broadcast_stream_1x1xC(data, res); } else if(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width) { broadcast_stream_HxWx1(data, res); @@ -180,19 +181,19 @@ void transpose_2d(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - for (int j = 0; j < data_T::size; j++) { - data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); } } for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < res_T::size; j++) { - out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); } - res.write(out_data); + res.write(out_data); } } } From ad89e3c03686d966c37e58b9ddbfd8aa886f5e7e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 17:15:59 +0200 Subject: [PATCH 05/20] Fix allocation pragmas --- .../nnet_utils/nnet_activation_stream.h | 4 +- .../vivado/nnet_utils/nnet_conv2d_latency.h | 53 ++----------------- 
.../vivado/nnet_utils/nnet_dense_compressed.h | 8 +-- .../vivado/nnet_utils/nnet_pooling.h | 8 +-- 4 files changed, 14 insertions(+), 59 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index bc36c1a5c..f866aa16f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -167,7 +167,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); @@ -241,7 +241,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ PRAGMA_DATA_PACK(out_pack) SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 24132e5c6..724eedbeb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -6,50 +6,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template - int compute_multiplier_limit_conv2d( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - //padded - do nothing - continue; - } else { - if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { - n_mult++; - } - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - template void conv_2d_latency_cf( data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], @@ -72,8 +28,7 @@ void conv_2d_latency_cf( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; oh < 
CONFIG_T::out_height; oh++) { @@ -188,8 +143,7 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { @@ -303,8 +257,7 @@ void pointwise_conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h index dc803ff2b..7202b3a10 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h @@ -51,10 +51,12 @@ void dense_compressed( #pragma HLS ARRAY_PARTITION variable=acc complete #pragma HLS ARRAY_PARTITION variable=biases complete #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - //if (CONFIG_T::store_weights_in_bram){ - //#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else #pragma HLS data_pack variable=weights struct_level - //} +#endif InitAccum: for(unsigned i = 0; i < CONFIG_T::n_out; i++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 5267a58fc..cd2c580f1 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -109,7 +109,7 @@ void pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { @@ -158,7 +158,7 @@ void global_pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { data_T pool[CONFIG_T::n_in]; @@ -209,7 +209,7 @@ void pooling2d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; @@ -263,7 +263,7 @@ void pooling2d_cf( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - 
#pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; From 88c39555973559cfd7a1a1f2bc485969cf9cedde Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 1 Aug 2022 17:42:23 +0200 Subject: [PATCH 06/20] Use recursive inlining instead of region --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h | 6 +++--- hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h | 4 ++-- hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index e887b2564..0dd2e8cde 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -81,7 +81,7 @@ void conv_1d_cl( typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: conv_1d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 7d451b3ba..6eff1eef6 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -97,7 +97,7 @@ void conv_2d_cl( typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: conv_2d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index a922d5854..dab9c39ad 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -89,7 +89,7 @@ void mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -272,7 +272,7 @@ void compute_output_buffer_2d( if ( (sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(kernel_data, res_out, weights, biases); } else { @@ -342,7 +342,7 @@ void compute_output_buffer_1d( if ( (sX - lShiftX) == 0 && pX > lShiftX - 1 ) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(kernel_data, res_out, weights, biases); } else { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h 
b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index c0e5d1759..180365327 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -270,7 +270,7 @@ void dense_resource( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { dense_resource_rf_leq_nin(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 9b0deb9b4..564bafac9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,7 +16,7 @@ void dense_wrapper( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor dense_latency(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 4113d9e36..08c4a6a8a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -245,7 +245,7 @@ void pooling2d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling2d_buffer_cl(data, res); @@ -441,7 +441,7 @@ void pooling1d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling1d_buffer_cl(data, res); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 5788d429e..ce6528995 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -76,7 +76,7 @@ void depthwise_mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(data, res, weights, biases); } else { @@ -156,7 +156,7 @@ void pointwise_mult_buffer( data[id] = data_pack[id]; } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -203,7 +203,7 @@ void compute_depthwise_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { @@ -267,7 +267,7 @@ void compute_depthwise_output_buffer_2d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { From e8b21760825d4671e517744710c14826b6575c44 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 3 Aug 2022 20:05:36 
+0200 Subject: [PATCH 07/20] Add Vitis HLS implementation overrides --- .../vitis/nnet_utils/nnet_conv1d_stream.h | 36 ++ .../vitis/nnet_utils/nnet_conv2d_stream.h | 81 +++++ .../vitis/nnet_utils/nnet_dense_stream.h | 102 ++++++ .../vitis/nnet_utils/nnet_pooling_stream.h | 341 ++++++++++++++++++ .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 88 +++++ .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 112 ++++++ hls4ml/writer/vitis_writer.py | 20 +- 7 files changed, 778 insertions(+), 2 deletions(-) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..f054adc3d --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,36 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +template +void conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + +} + + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 000000000..1c77f4f3e --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,81 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// Line Buffer +template +void conv_2d_buffer_latency_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + if (CONFIG_T::filt_height > 1) { + 
compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_buffer_resource_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_buffer_latency_cl(data, res, weights, biases); + } else { + conv_2d_buffer_resource_cl(data, res, weights, biases); + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h new file mode 100644 index 000000000..f8469f0cb --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,102 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void data_prepare( + hls::stream &data_stream, + typename data_T::value_type data[CONFIG_T::n_in] +) { + #pragma HLS INLINE + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: for(int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + #pragma HLS PIPELINE + data_T data_pack = data_stream.read(); + DataPackPipeline: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write( + typename res_T::value_type res[CONFIG_T::n_out], + hls::stream &res_stream +) { + #pragma HLS INLINE + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: for(unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { 
+ #pragma HLS PIPELINE + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } + } else { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackSingle: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + +template +void dense( + hls::stream &data_stream, + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) +{ + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + + data_prepare(data_stream, data); + dense_wrapper(data, res, weights, biases); + res_write(res, res_stream); +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 000000000..f936c7c88 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "utils/x_hls_utils.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_pooling.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template +T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template +void compute_pool_buffer_2d( + const data_T& in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt], + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == 
CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* +template +void compute_pool_buffer_1d( + const data_T& in_elem, + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ( (sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + #pragma HLS inline recursive + + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template +T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool( + const data_T& in_elem, + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt] +) { + PoolFilt: for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool(data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } + +} + +template +void global_pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION 
variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInput: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } + +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 000000000..d36dbe5f8 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,88 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv1d_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } +} + +template +void pointwise_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + + +template +void separable_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t 
depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 000000000..a483c46dd --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,112 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv2d_stream.h" + +namespace nnet { + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } +} + + +template +void pointwise_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < 
CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } +} + +template +void separable_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py index 45784acb6..44b7d97c0 100644 --- a/hls4ml/writer/vitis_writer.py +++ b/hls4ml/writer/vitis_writer.py @@ -1,6 +1,6 @@ import os -from shutil import copyfile, copytree -from distutils.dir_util import copy_tree +import glob +from shutil import copy from hls4ml.writer.vivado_writer import VivadoWriter class VitisWriter(VivadoWriter): @@ -8,8 +8,24 @@ class VitisWriter(VivadoWriter): def __init__(self): super().__init__() + def write_nnet_utils_overrides(self, model): + ################### + ## nnet_utils + ################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir,'../templates/vitis/nnet_utils/') + dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copy(srcpath + h, dstpath + h) + def write_hls(self, model): """ Write the HLS project. 
Calls the steps from VivadoWriter, adapted for Vitis """ super(VitisWriter, self).write_hls(model) + self.write_nnet_utils_overrides(model) From ef5530df0678d7121c2500233ce2d754e1f3e8f5 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 3 Aug 2022 20:05:57 +0200 Subject: [PATCH 08/20] Supported feature validation for Vitis backend --- hls4ml/backends/vitis/passes/feature_check.py | 25 +++++++++++++++++++ hls4ml/backends/vitis/vitis_backend.py | 17 ++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 hls4ml/backends/vitis/passes/feature_check.py diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py new file mode 100644 index 000000000..ee3e6d83b --- /dev/null +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -0,0 +1,25 @@ +from hls4ml.model.optimizer import OptimizerPass + + +class ValidateConvImplementation(OptimizerPass): + + def match(self, node): + return 'Conv' in node.class_name + + def transform(self, model, node): + if node.get_attr('implementation', 'linebuffer') == 'encoded': + print(f'WARNING: "Encoded" implementation in "{node.name}" ({node.class_name}) is not supported in Vitis backend. Switching to "LineBuffer" implementation.') + node.set_attr('implementation', 'linebuffer') + + +class ValidateStrategy(OptimizerPass): + _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] + + def match(self, node): + is_resource_layer = len([layer_cls for layer_cls in self._resource_layer_cls if layer_cls in node.class_name]) > 0 + is_resource_strategy = node.model.config.is_resource_strategy(node) + + return is_resource_layer and is_resource_strategy + + def transform(self, model, node): + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. 
Consider switching to "Latency" strategy.') diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 512917e93..b8b4bc798 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -2,7 +2,7 @@ import sys from hls4ml.backends import VivadoBackend -from hls4ml.model.flow import register_flow +from hls4ml.model.flow import register_flow, get_flow from hls4ml.report import parse_vivado_report @@ -12,10 +12,19 @@ def __init__(self): self._register_flows() def _register_flows(self): - vivado_ip = 'vivado:ip' + validation_passes = [ + 'vitis:validate_conv_implementation', + 'vitis:validate_strategy', + ] + validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + writer_passes = ['make_stamp', 'vitis:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) - self._default_flow = vivado_ip + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + + ip_flow_requirements = get_flow('vivado:ip').requires.copy() + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): if 'linux' in sys.platform: From 088496121d01aa90a2696882885c9e59097dcf10 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 4 Aug 2022 10:21:30 +0200 Subject: [PATCH 09/20] Fix Vitis pragmas --- .../vitis/nnet_utils/nnet_dense_stream.h | 29 +++++++++----- .../vivado/nnet_utils/nnet_conv1d_latency.h | 40 +------------------ 2 files changed, 22 insertions(+), 47 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h index f8469f0cb..955dc9e78 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -10,19 +10,24 @@ namespace nnet { template -void dense_wrapper( +void dense_latency_wrapper( data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); - } + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + dense_resource(data, res, weights, biases); } template @@ -86,6 +91,8 @@ void dense( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE recursive + typename data_T::value_type data[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=data complete @@ -93,7 +100,11 @@ void dense( #pragma HLS ARRAY_PARTITION variable=res complete data_prepare(data_stream, data); - dense_wrapper(data, res, weights, biases); + if (CONFIG_T::strategy == nnet::latency) { + dense_latency_wrapper(data, 
res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, biases); + } res_write(res, res_stream); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index f79903ee2..15fd2b49e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -6,40 +6,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template -int compute_multiplier_limit( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - for(int ii = 0; ii < CONFIG_T::out_width; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int jj = 0; jj < CONFIG_T::filt_width; jj++){ - - int index_weight = jj*CONFIG_T::n_chan*CONFIG_T::n_filt + cc*CONFIG_T::n_filt + ff; - - if((ii*CONFIG_T::stride_width+jj) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width+jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ - //padded -- do nothing - continue; - } else { - //need to tune this cut? - if( weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20 ){ - n_mult++; - }//end if nonzero weight - }//end not padding - }//end loop accross filter - }//end channel loop - }//end filter loop - }//end output loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - - template void conv_1d_latency_cl( data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], @@ -63,8 +29,7 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width; ii++) { @@ -141,8 +106,7 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width; ii++) { From b033ec642e947ddec66cd9d235f6749836e2f36b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 4 Aug 2022 10:31:33 +0200 Subject: [PATCH 10/20] Treat Vitis backend as module --- hls4ml/backends/vitis/__init__.py | 0 hls4ml/backends/vitis/passes/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 hls4ml/backends/vitis/__init__.py create mode 100644 hls4ml/backends/vitis/passes/__init__.py diff --git a/hls4ml/backends/vitis/__init__.py b/hls4ml/backends/vitis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/vitis/passes/__init__.py b/hls4ml/backends/vitis/passes/__init__.py new file mode 100644 index 000000000..e69de29bb From d7e6527f22cb081b58632063cb89243868025b95 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 17 Aug 2022 18:36:55 +0200 Subject: [PATCH 11/20] Limit function instances in pooling layers --- 
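Note (illustrative sketch, not part of the applied diff): the idea here is to bound how many parallel instances of the pool_op function Vitis HLS may create, using a compile-time limit derived from the layer geometry and the reuse factor. A minimal sketch of the pattern, with CONFIG_T fields taken from the pooling config structs added below and DIV_ROUNDUP coming from nnet_common.h:

    // Sketch only; the real templates are in the nnet_pooling.h diff that follows.
    template<class CONFIG_T>
    constexpr int pool_op_limit() {
        // One pool_op evaluation per output pixel and filter, time-multiplexed over reuse_factor
        return DIV_ROUNDUP(CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt,
                           CONFIG_T::reuse_factor);
    }

    // Inside a pooling kernel body:
    //     const int limit = pool_op_limit<CONFIG_T>();
    //     #pragma HLS ALLOCATION function instances=pool_op limit=limit
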
.../templates/vitis/nnet_utils/nnet_pooling.h | 314 ++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_pooling.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h new file mode 100644 index 000000000..1fb2ecca7 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -0,0 +1,314 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet{ + +// Return the maximum value from an array +template +T max(T x[N]){ + T y = x[0]; + for(int i = 1; i < N; i++){ + y = x[i] > y ? x[i] : y; + } + return y; +} + +template +ap_int avg(ap_int (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_int tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_int y = tmp; + return tmp; +} + +template +ap_fixed avg(ap_fixed (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_fixed tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template +T avg(T (&x)[N]){ + T y = 0; + for(int i = 0; i < N; i++){ + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template +T pool_op(T (&x)[N]){ + switch(op){ + case Max: return max(x); + case Average: return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template +T pad_val(){ + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch(op){ + case Max:{ + T x = 0; + x[x.width - 1] = 1; + return x; + break;} + case Average: return 0; + } +} + +struct pooling1d_config{ + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for(int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for(int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if(ii+jj < CONFIG_T::pad_left || ii+jj >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[jj] = pad_val(); + }else{ + pool[jj] = data[(ii + jj) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average) { + data_T rescale = CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + for(int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config{ + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) 
/ stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template +constexpr int pool_op_limit(){ + return DIV_ROUNDUP((CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt, CONFIG_T::reuse_factor); +} + +template +void pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width * CONFIG_T::n_filt + (jj + ll) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any 
necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width + ff * CONFIG_T::in_width*CONFIG_T::in_height + ll + jj]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +} + +#endif From 6916392ddd9a3ccb7d506cbb52fa73a733252152 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Aug 2022 16:03:14 +0200 Subject: [PATCH 12/20] Use consistent resource names for reports --- hls4ml/report/vivado_report.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index c7773ad9f..736fc3354 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -164,8 +164,13 @@ def parse_vivado_report(hls_dir): # Area area_node = root.find('./AreaEstimates') for child in area_node.find('./Resources'): + # DSPs are called 'DSP48E' in Vivado and just 'DSP' in Vitis. 
Overriding here to have consistent keys + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report[child.tag] = child.text for child in area_node.find('./AvailableResources'): + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report['Available' + child.tag] = child.text report['CSynthesisReport'] = c_synth_report else: From f44d621ec44763b1f3a7d84321f861cd0347cc82 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Aug 2022 19:27:18 +0200 Subject: [PATCH 13/20] Support RNNs in Vitis --- hls4ml/backends/vitis/vitis_backend.py | 1 + hls4ml/backends/vivado/passes/recurrent_templates.py | 1 + hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h | 8 ++++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index b8b4bc798..dbcf87c31 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -9,6 +9,7 @@ class VitisBackend(VivadoBackend): def __init__(self): super(VivadoBackend, self).__init__(name='Vitis') + self._register_layer_attributes() self._register_flows() def _register_flows(self): diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 74ec61e82..d7c826e74 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index e94286aa8..a2581a94c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -293,7 +293,7 @@ template nnet::lstm(reset_state,data_in,h_newstate, s_newstate, param,param_r,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -305,7 +305,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -565,7 +565,7 @@ template nnet::gru(reset_state,data_in,h_newstate,param,param_zr,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -577,7 +577,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; From b5afb520c36abf971a7124ddd57c3bd6ffd13570 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 19 Oct 2022 19:12:18 +0200 Subject: [PATCH 14/20] Properly set the multiplier limit for Conv1D/2D --- 
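Note (illustrative sketch, not part of the applied diff): the conv latency kernels now take the multiplier bound from the nested mult_config of the layer configuration rather than from the conv config itself, so the bound is a single compile-time constant shared with the dense-style multiplication config. A rough sketch of the assumed layout, with a placeholder value:

    // Assumed config shape for illustration; the backend fills in the real numbers per layer.
    struct example_conv_config {
        struct mult_config {
            static const unsigned multiplier_limit = 64;   // placeholder value
        };
        // ... remaining convolution parameters ...
    };

    // Used in conv_1d_latency_cl / conv_2d_latency_cl as:
    //     #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit
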
.../vivado/nnet_utils/nnet_conv1d_latency.h | 2 +- .../vivado/nnet_utils/nnet_conv2d_latency.h | 117 +----------------- 2 files changed, 2 insertions(+), 117 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0f9f51deb..dd7225346 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -30,7 +30,7 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 6d46836f8..43222696c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -7,121 +7,6 @@ namespace nnet { -template -void conv_2d_latency_cf( - data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height*CONFIG_T::out_width*CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ - - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - ConvOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - ConvFiltHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - ConvFiltWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= 
(CONFIG_T::pad_left+CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - int index_data = cc*CONFIG_T::in_height*CONFIG_T::in_width - + (oh*CONFIG_T::stride_height+fh-CONFIG_T::pad_top)*CONFIG_T::in_width - + (ow*CONFIG_T::stride_width+fw-CONFIG_T::pad_left); - mult[index_mult] = data[index_data] * weights[index_weight]; - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - - // Initialize accumulator with input biases - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - acc[oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff]=biases[ff]; - } - } - } - - - // Accumulate multiplication result - AccumOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - AccumOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - AccumDotHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - AccumDotWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - int index_acc = oh*CONFIG_T::out_width*CONFIG_T::n_filt - + ow*CONFIG_T::n_filt - + ff; - - acc[index_acc] += mult[index_mult]; - - }//end dot product filter width loop - }//end dot product filter height loop - }//end n channel loop - }//end n filter loop - }//end output width loop - }//end output height loop - - // Cast to "res_t" type - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - int res_index = ff*CONFIG_T::out_height*CONFIG_T::out_width + oh*CONFIG_T::out_width + ow; - int acc_index = oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff; - res[res_index] = acc[acc_index]; - } - } - } - -}//end conv2d - template void conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], @@ -145,7 +30,7 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { From 797c5f4df48e47656561c5fade8b31f61054ada6 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 19 Oct 2022 19:39:42 +0200 Subject: [PATCH 15/20] Add option to configure clock uncertainty --- hls4ml/templates/vivado/build_prj.tcl | 2 +- hls4ml/writer/vivado_writer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index f05985209..d34337c57 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -166,7 +166,7 @@ config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false 
create_clock -period $clock_period -name default -set_clock_uncertainty 12.5% default +set_clock_uncertainty $clock_uncertainty default if {$opt(csim)} { diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 9cdbefb4e..03a8923c7 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -516,6 +516,8 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.close() ################### From 8c7a6b0b26ee6648d1149a974a79eb89bbb9d707 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 20 Oct 2022 14:59:35 +0200 Subject: [PATCH 16/20] Add uncertainty to accelerator writer as well --- hls4ml/writer/vivado_accelerator_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index f979b6032..b92ce74ab 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -340,6 +340,8 @@ def write_board_script(self, model): f.write('set part "{}"\n'.format(self.vivado_accelerator_config.get_part())) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write('set bit_width_hls_output {}\n'.format(in_bit)) From 94dbe80fc3e6d821edad5ad50e32396ef5589cd9 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 20 Oct 2022 21:02:56 +0200 Subject: [PATCH 17/20] Enabling resource strategy for Vitis backend --- hls4ml/backends/vitis/passes/feature_check.py | 5 +- .../vitis/nnet_utils/nnet_dense_resource.h | 247 ++++++++++++++++++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py index ee3e6d83b..eddd5530f 100644 --- a/hls4ml/backends/vitis/passes/feature_check.py +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -22,4 +22,7 @@ def match(self, node): return is_resource_layer and is_resource_strategy def transform(self, model, node): - print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. Consider switching to "Latency" strategy.') + n_in, _ = model.config.backend.get_layer_mult_size(node) + rf = node.get_attr('reuse_factor') + if rf > n_in and rf % n_in > 0: + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. 
Consider using a different ReuseFactor or switching to "Latency" strategy.')
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h
new file mode 100644
index 000000000..d96b75b47
--- /dev/null
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h
@@ -0,0 +1,247 @@
+#ifndef NNET_DENSE_RESOURCE_H_
+#define NNET_DENSE_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include "hls_stream.h"
+#include
+#include
+
+namespace nnet {
+
+template
+void dense_resource_rf_leq_nin(
+    data_T data[CONFIG_T::n_in],
+    res_T res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit / CONFIG_T::n_out;
+
+    assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed");
+    assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN");
+
+    // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices.
+    // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own.
+    typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights;
+
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    InitAccum:
+    for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
+    }
+
+    ReuseLoop:
+    for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) {
+        #pragma HLS PIPELINE II=1 rewind
+
+        int in_index = ir;
+        int out_index = 0;
+        int acc_step = 0;
+
+        MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+
+            acc[out_index] += static_cast(
+                CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]));
+
+            // Increment in_index
+            in_index += CONFIG_T::reuse_factor;
+            if (in_index >= CONFIG_T::n_in) {
+                in_index = ir;
+            }
+            // Increment out_index
+            if (acc_step + 1 >= multscale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+
+    // Cast to "res_t" type
+    Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        res[ires] = cast(acc[ires]);
+    }
+}
+
+template
+void dense_resource_rf_gt_nin_rem0(
+    data_T data[CONFIG_T::n_in],
+    res_T res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::n_in);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+
+    assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed");
+    assert((CONFIG_T::reuse_factor > CONFIG_T::n_in && CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0");
+
+    // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices.
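+    // With this 2d view, weights_2d[im][ir] is simply weights[im * CONFIG_T::reuse_factor + ir], so the index is formed without any modulo or division.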
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = CONFIG_T::reuse_factor / CONFIG_T::n_in; + + int outidx[CONFIG_T::reuse_factor]; + IndexLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % CONFIG_T::n_in == 0) { + outstep++; + } + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + out_index = outidx[ir]/*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + out_index += outscale; + } + + in_index++; + if (in_index >= CONFIG_T::n_in) { + in_index = 0; + //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = CONFIG_T::n_out; + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int in_index = w_index % CONFIG_T::n_in; // As of Vitis HLS 2022.1, this still results in urem core being used. 
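+            // w_index walks the flat weight array with a stride of reuse_factor; in_index is the input element that this weight multiplies.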
+ tmpmult[im] = CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int out_index = w_index / CONFIG_T::n_in; + if (out_index >= multiplier_limit) continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} + +#endif From 9dcf2f34549030ff2e05c5c4bf769ce58377f2da Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 24 Oct 2022 20:11:44 +0200 Subject: [PATCH 18/20] Resource strategy for Conv1D/2D io_parallel --- .../vitis/nnet_utils/nnet_conv1d_resource.h | 102 +++++++++++++++++ .../vitis/nnet_utils/nnet_conv2d_resource.h | 104 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 000000000..6477bbd90 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,102 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. 
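+    // The convolution is evaluated as a dense layer over mult_n_in = filt_width * n_chan inputs per output pixel, so the same blocked weight view as in nnet_dense_resource.h is used here.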
+ typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 000000000..ea0afc7d2 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,104 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = multiplier_limit / mult_n_out; + + assert((multiplier_limit % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. 
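+    // As in the 1D case, each (filter position, channel) pair acts as one of mult_n_in dense inputs for every output pixel.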
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif From a2119aab30aa2683ecf29c3dae7dd6ac8990ff7b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 17 Mar 2023 22:06:43 +0100 Subject: [PATCH 19/20] Add Vitis to tests --- hls4ml/backends/vitis/vitis_backend.py | 4 +++ .../templates/vitis/nnet_utils/nnet_pooling.h | 26 +++++++++++++++++++ .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 3 ++- .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 3 ++- hls4ml/writer/vivado_writer.py | 2 +- test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 2 +- test/pytest/test_causalpadding.py | 2 +- test/pytest/test_cnn_mnist.py | 4 +++ test/pytest/test_cnn_mnist_qkeras.py | 14 ++++++++-- test/pytest/test_conv1d.py | 12 +++++++-- test/pytest/test_embed.py | 4 +-- test/pytest/test_extensions.py | 9 ++++--- test/pytest/test_globalpooling.py | 4 +-- test/pytest/test_keras_api.py | 14 +++++----- test/pytest/test_keras_h5_loader.py | 2 +- test/pytest/test_merge.py | 10 +++---- test/pytest/test_pointwiseconv.py | 8 +++--- test/pytest/test_qkeras.py | 6 ++--- test/pytest/test_rnn.py | 4 +++ test/pytest/test_sepconv2d.py | 7 ++--- test/pytest/test_softmax.py | 4 +-- test/pytest/test_softsign.py | 2 +- test/pytest/test_trace.py | 2 +- test/pytest/test_transpose_concat.py | 4 +-- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 27 files changed, 111 insertions(+), 47 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index dbcf87c31..8fc4ab9c3 
100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -19,11 +19,15 @@ def _register_flows(self): ] validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + # Any potential templates registered specifically for Vitis backend + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=['vivado:init_layers'], backend=self.name) + writer_passes = ['make_stamp', 'vitis:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) ip_flow_requirements = get_flow('vivado:ip').requires.copy() ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow) self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h index 1fb2ecca7..ac921e0d3 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -309,6 +309,32 @@ void pooling2d_cf( } } + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + FiltLoop: + for(int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0 ; i < CONFIG_T::in_height * CONFIG_T::in_width ; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + } #endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index d36dbe5f8..6850497ff 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -72,7 +72,8 @@ void separable_conv_1d_cl( typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] ) { - assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); #pragma HLS DATAFLOW diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index a483c46dd..352828ecd 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -96,7 +96,8 @@ void separable_conv_2d_cl( typename CONFIG_T::depthwise_config::bias_t 
depthwise_biases[CONFIG_T::depthwise_config::n_chan], typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] ) { - assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); #pragma HLS DATAFLOW diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 4e8cd9ad9..c70e28bb5 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -643,7 +643,7 @@ def write_nnet_utils(self, model): # custom source filedir = os.path.dirname(os.path.abspath(__file__)) - custom_source = get_backend('Vivado').get_custom_source() + custom_source = model.config.backend.get_custom_source() for dst, srcpath in custom_source.items(): dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' copyfile(srcpath, dstpath) diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 7aea0884e..9875bfe14 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -9,7 +9,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index 1b17637d9..f50329230 100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py index d183d81c4..4e128b874 100644 --- a/test/pytest/test_causalpadding.py +++ b/test/pytest/test_causalpadding.py @@ -10,7 +10,7 @@ atol = 5e-3 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_causalpadding(io_type, backend): model = Sequential() diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 262ae5013..ab3365f22 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -58,6 +58,10 @@ def keras_model(mnist_data): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_cnn_mnist_qkeras.py b/test/pytest/test_cnn_mnist_qkeras.py index c34e0965a..cf3dbf17d 100644 --- a/test/pytest/test_cnn_mnist_qkeras.py +++ b/test/pytest/test_cnn_mnist_qkeras.py @@ -40,7 +40,12 @@ def mnist_model(): ('Vivado', 
'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def hls_model(mnist_model, backend, io_type, strategy): keras_model = mnist_model @@ -66,7 +71,12 @@ def hls_model(mnist_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def test_accuracy(mnist_data, mnist_model, hls_model): x_train, y_train, x_test, y_test = mnist_data diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 1d91d80ea..bc8a68002 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -30,7 +30,11 @@ def keras_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def hls_model(keras_model, backend, io_type, strategy): default_precision = 'ap_fixed<16,3,AP_RND_CONV,AP_SAT>' if backend=='Vivado' else 'ac_fixed<16,3,true,AC_RND_CONV,AC_SAT>' @@ -63,7 +67,11 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index 8073a7a1a..fd8e39cdb 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 1c8e07198..9945768ea 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -126,11 +126,14 @@ def regsister_custom_layer(): hls4ml.model.layers.register_layer('HReverse', HReverse) -@pytest.mark.parametrize('backend_id', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend_id', ['Vivado', 'Vitis', 'Quartus']) def test_extensions(tmp_path, backend_id): # 
Register the optimization passes (if any) backend = hls4ml.backends.get_backend(backend_id) - backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize') + ip_flow = hls4ml.model.flow.get_flow(backend.get_default_flow()) + # Add the pass into the main optimization flow + optimize_flow = [flow for flow in ip_flow.requires if ':optimize' in flow][0] + backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=optimize_flow) # Register template passes for the given backend backend.register_template(HReverseConfigTemplate) @@ -168,6 +171,6 @@ def test_extensions(tmp_path, backend_id): hres = hmodel.predict(x.astype('float32')) # Check if the optimizer pass was applied - assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][f'{backend_id.lower()}:optimize'] + assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][optimize_flow] np.testing.assert_array_equal(kres, hres) diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index 79260afbd..829c8f5d9 100644 --- a/test/pytest/test_globalpooling.py +++ b/test/pytest/test_globalpooling.py @@ -30,7 +30,7 @@ def keras_model_avg_1d(): return model -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize('model_type', ['max', 'avg']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_global_pool1d(backend, keras_model_max_1d, keras_model_avg_1d, data_1d, model_type, io_type): @@ -70,7 +70,7 @@ def keras_model_avg_2d(): model.compile() return model -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize('model_type', ['max', 'avg']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_global_pool2d(backend, keras_model_max_2d, keras_model_avg_2d, data_2d, model_type, io_type): diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index bd3f175b1..6da516646 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -15,7 +15,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_dense(backend, io_type): model = tf.keras.models.Sequential() @@ -66,7 +66,7 @@ def test_dense(backend, io_type): PReLU(alpha_initializer="zeros",), Activation(activation='sigmoid', name='Activation')]) #ThresholdedReLU(theta=1.0)]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activations(activation_function, backend, io_type): model = tf.keras.models.Sequential() @@ -94,7 +94,7 @@ def test_activations(activation_function, backend, io_type): padds_options = ['same', 'valid'] @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv1d(padds, backend, io_type): model = tf.keras.models.Sequential() @@ -123,8 +123,8 @@ def test_conv1d(padds, backend, io_type): # 5e-2 might be too high 
np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2) - if not (backend=='Vivado' and io_type=='io_stream' and padds=='same'): - # Vivado inserts and additional layer for 'same' padding in io_stream + if not (backend in ['Vivado', 'Vitis'] and io_type=='io_stream' and padds=='same'): + # Vivado/Vitis inserts and additional layer for 'same' padding in io_stream assert len(model.layers) + 2 == len(hls_model.get_layers()) assert list(hls_model.get_layers())[1].attributes['name'] == model.layers[0]._name assert list(hls_model.get_layers())[1].attributes['class_name'] == 'Conv1D' @@ -154,7 +154,7 @@ def test_conv1d(padds, backend, io_type): padds_options=['same', 'valid'] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv2d(chans, padds, backend, io_type): model = tf.keras.models.Sequential() @@ -235,7 +235,7 @@ def test_conv2d(chans, padds, backend, io_type): @pytest.mark.parametrize('pooling', pooling_layers) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('chans', chans_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_pooling(pooling, padds, chans, backend): assert '1D' in pooling.__name__ or '2D' in pooling.__name__ diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py index 0fa689e45..08753d584 100644 --- a/test/pytest/test_keras_h5_loader.py +++ b/test/pytest/test_keras_h5_loader.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_keras_h5_loader(backend): input_shape = (10,) model = tf.keras.models.Sequential([ diff --git a/test/pytest/test_merge.py b/test/pytest/test_merge.py index 470e9b3ff..8ab4fa3a1 100644 --- a/test/pytest/test_merge.py +++ b/test/pytest/test_merge.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize('merge_layer', [Add, Average, Maximum, Minimum, Multiply, Subtract]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_merge(merge_layer, io_type, backend): input_shape = (10, 10, 3) @@ -35,7 +35,7 @@ def test_merge(merge_layer, io_type, backend): @pytest.mark.parametrize('axes', [1]) @pytest.mark.parametrize('io_type', ['io_parallel']) # No io_stream implementation yet -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_dot(axes, io_type, backend): # Only 1D implemented input_shape = (10, ) @@ -61,7 +61,7 @@ def test_dot(axes, io_type, backend): np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate1d(io_type, backend): input_shape = (10,) @@ -87,7 +87,7 @@ def test_concatenate1d(io_type, backend): @pytest.mark.parametrize('axis', [1, 2]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) 
-@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate2d(axis, io_type, backend): input_shape = (10, 3) @@ -114,7 +114,7 @@ def test_concatenate2d(axis, io_type, backend): @pytest.mark.parametrize('axis', [1, 2, 3]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate3d(axis, io_type, backend): input_shape = (10, 10, 3) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 7650056f8..d43e35288 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -20,11 +20,13 @@ @pytest.mark.parametrize('backend, io_type, strategy', [ ('Quartus', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'resource'), - + ('Vitis', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), - + ('Vitis', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): model = tf.keras.models.Sequential() diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 19fa8375f..e7fa1ea15 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -127,7 +127,7 @@ def randX_100_16(): # https://github.com/fastmachinelearning/hls4ml/issues/381 # @pytest.mark.parametrize('bits', [4, 6, 8]) @pytest.mark.parametrize('bits,alpha', [(4, 1), (4, 'auto_po2')]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_type): ''' @@ -197,7 +197,7 @@ def randX_100_10(): (7, 10, binary(), quantized_bits(5, 2), binary(), False, True), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_btnn(make_btnn, randX_100_10, backend, io_type): model, is_xnor, test_no = make_btnn @@ -240,7 +240,7 @@ def randX_1000_1(): (quantized_relu(10, 5)), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_quantizer(randX_1000_1, quantizer, backend, io_type): ''' diff --git a/test/pytest/test_rnn.py b/test/pytest/test_rnn.py index 12fc42601..aa49e43d3 100644 --- a/test/pytest/test_rnn.py +++ b/test/pytest/test_rnn.py @@ -70,10 +70,14 @@ def test_rnn_parsing(rnn_layer, return_sequences): [ (SimpleRNN, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_parallel'), + (LSTM, 'Vitis', 'io_parallel'), (LSTM, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_stream'), + (LSTM, 'Vitis', 'io_stream'), (GRU, 'Vivado', 'io_parallel'), (GRU, 'Vivado', 'io_stream'), + (GRU, 'Vitis', 'io_parallel'), + (GRU, 'Vitis', 'io_stream'), (GRU, 'Quartus', 'io_parallel'), (GRU, 'Quartus', 'io_stream'), ], diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 7815d5770..d32569449 100644 --- a/test/pytest/test_sepconv2d.py +++ 
b/test/pytest/test_sepconv2d.py @@ -23,7 +23,8 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) model.add(conv2d(filters=32, @@ -42,8 +43,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') - output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds)) - hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type) + output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type)) + hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend) hls_model.compile() hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 749a019f3..9290faf50 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -23,7 +23,7 @@ def high_accuracy_distribution(shape): def generate_data(function, input_shape): return function((1000, *input_shape)) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('strategy', ['stable', 'argmax']) @pytest.mark.parametrize('function,input_shape,io_type', [ (flat_distribution, (8,), 'io_parallel'), @@ -58,7 +58,7 @@ def test_softmax(backend, strategy, generate_data, input_shape, io_type, functio assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 2f70b8251..338aaf6f3 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -7,7 +7,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('input_shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_trace.py b/test/pytest/test_trace.py index ce01c4213..4c7cde4ac 100644 --- a/test/pytest/test_trace.py +++ b/test/pytest/test_trace.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_trace(backend): '''Test the tracing feature with a simple Keras model.''' model = tf.keras.models.Sequential() diff --git a/test/pytest/test_transpose_concat.py 
b/test/pytest/test_transpose_concat.py index 488fc46b6..db3e03125 100644 --- a/test/pytest/test_transpose_concat.py +++ b/test/pytest/test_transpose_concat.py @@ -29,7 +29,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name' @@ -45,7 +45,7 @@ def hls_model(keras_model, backend, io_type): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_accuracy(data, keras_model, hls_model): X = data model = keras_model diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 7e698fd90..0f5130162 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -41,7 +41,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 219f727c0..ca539a9ef 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -45,7 +45,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': From 35fe5720223195bcf3044b1d89fac1cff03513f6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Fri, 24 Mar 2023 14:41:33 -0700 Subject: [PATCH 20/20] Merge with main (#52) * Add quantized sigmoid, fix quantized tanh for QKeras (#569) * snapshot of beginnings * make version that works for Vivado, error for Quartus * Change order of precision from quantizer * add hard sigmoid and tanh * fix setting of slope and shift type * revert config parsing--seems a little strange but works * fix hard_sigmoid and hard_tanh for streaming * update pytest for quantized tanh and sigmoid * remove inadvertently included matoplotlib * add special case when W == min_width. * fix merge of main * Go back to having AP_TRN and AP_WRP as defaults * handle case when use_real_tanh is not defined * make the activations use AP_RND_CONV (and AP_SAT) by default * remove use of use_real_tanh in test since not always supported * fix incorrect default types for Keras (not QKeras) hard_sigmoid * Mostly fix up things for Quartus * get rid of intermediate cast * fix an i++ compilation issue * Quartus seems to not like ac_fixed<1,0,false>, so make 2 bits. 
* fix activation quantizer * make sat, round defeult activation parameters, don't need to set again * Make the slope and shift not be configurable for HardActivation * some pre-commit fixes * pre-commint //hls to // hls fixes * update CI version * fixes for parsing errors from pre-commits * remove qactivation from list of activation_layers * print_vivado_report function for nicer reports (#730) * print_vivado_report function for fancier reports * Fancy reports (#51) * fix uram divide by 0 * add test * fix parsing of vsynth in 2020.1; add test * Update test_report.py * exclude pregenerated reports --------- Co-authored-by: Javier Duarte --------- Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir --- .pre-commit-config.yaml | 2 +- .../backends/quartus/passes/core_templates.py | 58 +- .../backends/vivado/passes/core_templates.py | 58 +- hls4ml/converters/keras/core.py | 2 + hls4ml/converters/keras/qkeras_layers.py | 24 +- hls4ml/converters/keras_to_hls.py | 11 +- hls4ml/model/layers.py | 27 + hls4ml/report/__init__.py | 12 +- hls4ml/report/vivado_report.py | 438 ++++++++- hls4ml/templates/quartus/firmware/defines.h | 50 +- .../templates/quartus/firmware/myproject.cpp | 47 +- hls4ml/templates/quartus/firmware/myproject.h | 24 +- .../firmware/nnet_utils/nnet_activation.h | 465 +++++----- .../nnet_utils/nnet_activation_stream.h | 460 +++++---- .../templates/quartus/firmware/parameters.h | 5 +- hls4ml/templates/quartus/myproject_bridge.cpp | 21 +- .../quartus/myproject_test_parallel.cpp | 111 ++- .../quartus/myproject_test_stream.cpp | 122 ++- hls4ml/templates/vivado/firmware/defines.h | 6 +- .../templates/vivado/firmware/myproject.cpp | 8 +- hls4ml/templates/vivado/firmware/myproject.h | 4 +- hls4ml/templates/vivado/firmware/parameters.h | 12 +- hls4ml/templates/vivado/myproject_bridge.cpp | 24 +- hls4ml/templates/vivado/myproject_test.cpp | 142 ++- .../vivado/nnet_utils/nnet_activation.h | 481 +++++----- .../nnet_utils/nnet_activation_stream.h | 367 +++++--- .../vivado/nnet_utils/nnet_code_gen.h | 35 +- .../vivado_accelerator/myproject_axi.cpp | 17 +- .../vivado_accelerator/myproject_axi.h | 9 +- hls4ml/utils/config.py | 20 +- hls4ml/writer/quartus_writer.py | 71 +- hls4ml/writer/vivado_accelerator_writer.py | 305 +++--- hls4ml/writer/vivado_writer.py | 49 +- test/pytest/test_qkeras.py | 50 +- test/pytest/test_report.py | 71 ++ test/pytest/test_report/myproject_csynth.rpt | 196 ++++ test/pytest/test_report/myproject_csynth.xml | 878 ++++++++++++++++++ test/pytest/test_report/vivado_hls.app | 15 + test/pytest/test_report/vivado_synth.rpt | 184 ++++ 39 files changed, 3443 insertions(+), 1438 deletions(-) mode change 100755 => 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h create mode 100644 test/pytest/test_report.py create mode 100644 test/pytest/test_report/myproject_csynth.rpt create mode 100644 test/pytest/test_report/myproject_csynth.xml create mode 100644 test/pytest/test_report/vivado_hls.app create mode 100644 test/pytest/test_report/vivado_synth.rpt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f1f6823a..83d09fbe3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: ^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/ +exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pytest/test_report/) repos: - repo: https://github.com/psf/black diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index 
26b99db10..aece9fc22 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -1,7 +1,6 @@ - from hls4ml.backends.backend import get_backend -from hls4ml.model.layers import Activation, BatchNormalization, Dense, PReLU, ParametrizedActivation, Softmax -from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax # Dense templates @@ -38,24 +37,28 @@ dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) self.template = dense_config_template - + def format(self, node): params = self._default_config_params(node) params['nzeros'] = node.get_weights('weight').nzeros params['nonzeros'] = node.get_weights('weight').nonzeros - params['product_type'] = get_backend('quartus').product_type(node.get_input_variable().type.precision, node.get_weights('weight').type.precision) + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) return self.template.format(**params) + class DenseFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(Dense, include_header=dense_include_list) self.template = dense_function_template - + def format(self, node): params = self._default_function_params(node) params['w'] = node.get_weights('weight').name @@ -82,23 +85,27 @@ def format(self, node): batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + class BatchNormalizationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(BatchNormalization) self.template = batchnorm_config_template - + def format(self, node): params = self._default_config_params(node) params['n_in'] = node.get_input_variable().size_cpp() - params['product_type'] = get_backend('quartus').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) return self.template.format(**params) + class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name @@ -117,6 +124,16 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; static const unsigned table_size = {table_size}; @@ -132,6 +149,7 @@ def format(self, node): activ_include_list = 
['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) @@ -143,16 +161,30 @@ def format(self, node): return self.template.format(**params) + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class SoftmaxConfigTemplate(ActivationConfigTemplate): def __init__(self): - super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ self.template = softmax_config_template + class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__((Activation, Softmax), include_header=activ_include_list) + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) self.template = activ_function_template - + def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() @@ -160,6 +192,7 @@ def format(self, node): return self.template.format(**params) + class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(ParametrizedActivation, include_header=activ_include_list) @@ -173,6 +206,7 @@ def format(self, node): return self.template.format(**params) + class PReLUFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(PReLU, include_header=activ_include_list) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 8327e3a7f..c8119c0c2 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -1,7 +1,6 @@ - from hls4ml.backends.backend import get_backend -from hls4ml.model.layers import Activation, BatchNormalization, Dense, Embedding, PReLU, ParametrizedActivation, Softmax -from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax # Dense templates @@ -27,24 +26,28 @@ dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) self.template = dense_config_template - + def format(self, node): params = self._default_config_params(node) params['nzeros'] = node.get_weights('weight').nzeros params['nonzeros'] = node.get_weights('weight').nonzeros - params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('weight').type.precision) + params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) return self.template.format(**params) + class DenseFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(Dense, include_header=dense_include_list) self.template = dense_function_template - + def format(self, 
node): params = self._default_function_params(node) params['w'] = node.get_weights('weight').name @@ -73,23 +76,27 @@ def format(self, node): batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + class BatchNormalizationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(BatchNormalization) self.template = batchnorm_config_template - + def format(self, node): params = self._default_config_params(node) params['n_in'] = node.get_input_variable().size_cpp() - params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) return self.template.format(**params) + class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name @@ -108,6 +115,16 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; static const unsigned table_size = {table_size}; @@ -124,6 +141,7 @@ def format(self, node): activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) @@ -135,16 +153,30 @@ def format(self, node): return self.template.format(**params) + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class SoftmaxConfigTemplate(ActivationConfigTemplate): def __init__(self): - super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ self.template = softmax_config_template + class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__((Activation, Softmax), include_header=activ_include_list) + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) self.template = activ_function_template - + def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() @@ -152,6 +184,7 @@ def format(self, node): return self.template.format(**params) + class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(ParametrizedActivation, include_header=activ_include_list) @@ -165,6 +198,7 @@ def 
format(self, node): return self.template.format(**params) + class PReLUFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(PReLU, include_header=activ_include_list) diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 4411ae4c5..97bdefabd 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -105,6 +105,8 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': layer['class_name'] = 'Softmax' + if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': + layer['class_name'] = 'HardActivation' if layer['class_name'] == 'Softmax': layer['axis'] = keras_layer['config'].get('axis', -1) diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py index 5839ca542..b547c3968 100644 --- a/hls4ml/converters/keras/qkeras_layers.py +++ b/hls4ml/converters/keras/qkeras_layers.py @@ -4,6 +4,7 @@ from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer from hls4ml.converters.keras.qkeras import get_quantizer_from_config from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.model.types import FixedPrecisionType @keras_handler('QDense') @@ -46,6 +47,7 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) 'quantized_tanh', 'binary_tanh', 'ternary_tanh', + 'quantized_sigmoid', 'quantized_bits', 'binary', 'ternary', @@ -79,16 +81,32 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) if activation_config['class_name'] not in supported_activations: raise Exception('Unsupported QKeras activation: {}'.format(activation_config['class_name'])) + if activation_config['class_name'] == 'quantized_bits': + activation_config['class_name'] = 'linear' + if activation_config['class_name'] == 'ternary_tanh': layer['class_name'] = 'TernaryTanh' layer['threshold'] = activation_config.get('config', {}).get('threshold', 0.33) if layer['threshold'] is None: layer['threshold'] = 0.33 # the default ternary tanh threshold for QKeras + layer['activation'] = 'ternary_tanh' + elif ( + activation_config['class_name'] == 'quantized_sigmoid' + and not activation_config['config'].get('use_real_sigmoid', False) + ) or ( + activation_config['class_name'] == 'quantized_tanh' and not activation_config['config'].get('use_real_tanh', False) + ): + layer['class_name'] = 'HardActivation' + layer['slope'] = 0.5 # the default values in QKeras + layer['shift'] = 0.5 + # Quartus seems to have trouble if the width is 1. 
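As a reference for the mapping above: QKeras' quantized_sigmoid and quantized_tanh (when their use_real_sigmoid / use_real_tanh options are unset) are converted into the new HardActivation layer with slope = shift = 0.5, while the stock Keras hard_sigmoid keeps slope 0.2 and shift 0.5. A minimal NumPy sketch of the piecewise-linear functions this layer is expected to reproduce; the function names are illustrative and not part of hls4ml:

import numpy as np

def hard_sigmoid(x, slope=0.5, shift=0.5):
    # QKeras defaults; the plain Keras hard_sigmoid uses slope=0.2, shift=0.5
    return np.clip(slope * x + shift, 0.0, 1.0)

def hard_tanh(x, slope=0.5, shift=0.5):
    # hard tanh is defined through hard sigmoid: 2 * hard_sigmoid(x) - 1
    return 2.0 * hard_sigmoid(x, slope, shift) - 1.0
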
+ layer['slope_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) + layer['shift_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) + layer['activation'] = activation_config['class_name'].replace('quantized_', 'hard_') else: layer['class_name'] = 'Activation' - if activation_config['class_name'] == 'quantized_bits': - activation_config['class_name'] = 'linear' - layer['activation'] = activation_config['class_name'].replace('quantized_', '') + layer['activation'] = activation_config['class_name'].replace('quantized_', '') + layer['activation_quantizer'] = activation_config return layer, [shape for shape in input_shapes[0]] diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 2c875e42f..43748246d 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -257,7 +257,16 @@ def parse_keras_model(model_arch, reader): # Define layers to skip for conversion to HLS skip_layers = ['Dropout'] # Activation layers - activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'TernaryTanh'] + activation_layers = [ + 'Activation', + 'LeakyReLU', + 'ThresholdedReLU', + 'ELU', + 'PReLU', + 'Softmax', + 'TernaryTanh', + 'HardActivation', + ] # Recurrent layers recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] # All supported layers diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index b8a3a1a4d..77c874f58 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -768,6 +768,32 @@ def _get_act_function_name(self): return act # ELU activation +class HardActivation(Activation): + ''' + Implements the hard sigmoid and tan function in keras and qkeras + (Default parameters in qkeras are different, so should be configured) + The hard sigmoid unction is clip(slope * x + shift, 0, 1), and the + hard tanh function is 2 * hard_sigmoid - 1 + ''' + + _expected_attributes = [ + Attribute('slope', value_type=float, default=0.2, configurable=False), + Attribute('shift', value_type=float, default=0.5, configurable=False), + TypeAttribute('slope_t'), + TypeAttribute('shift_t'), + ] + + def initialize(self): + super().initialize() + slope_prec = self.get_attr('slope_prec', FixedPrecisionType(width=16, integer=0, signed=False)) + shift_prec = self.get_attr('shift_prec', FixedPrecisionType(width=1, integer=0, signed=False)) + index = self.get_attr('index') + slope_t = NamedType(f'slope{index}_t', precision=slope_prec) + shift_t = NamedType(f'shift{index}_t', precision=shift_prec) + self.set_attr('slope_t', slope_t) + self.set_attr('shift_t', shift_t) + + class PReLU(Activation): def initialize(self): super().initialize() @@ -1264,6 +1290,7 @@ def _initialize_transforms(self): 'PReLU': PReLU, 'Softmax': Softmax, 'TernaryTanh': TernaryTanh, + 'HardActivation': HardActivation, 'Reshape': Reshape, 'Dense': Dense, 'BinaryDense': Dense, diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index a75262d1f..b73558f6e 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,7 +1,5 @@ -from __future__ import absolute_import - -from hls4ml.report.vivado_report import read_vivado_report -from hls4ml.report.vivado_report import parse_vivado_report - -from hls4ml.report.quartus_report import read_quartus_report -from hls4ml.report.quartus_report import parse_quartus_report \ No newline at end of file +from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 +from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 +from 
hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 +from hls4ml.report.vivado_report import print_vivado_report # noqa: F401 +from hls4ml.report.vivado_report import read_vivado_report # noqa: F401 diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 736fc3354..68c3ad9dd 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -1,12 +1,12 @@ -from __future__ import print_function import os import re import sys import xml.etree.ElementTree as ET + def read_vivado_report(hls_dir, full_report=False): if not os.path.exists(hls_dir): - print('Path {} does not exist. Exiting.'.format(hls_dir)) + print(f'Path {hls_dir} does not exist. Exiting.') return prj_dir = None @@ -21,30 +21,37 @@ def read_vivado_report(hls_dir, full_report=False): sln_dir = hls_dir + '/' + prj_dir if not os.path.exists(sln_dir): - print('Project {} does not exist. Rerun "hls4ml build -p {}".'.format(prj_dir, hls_dir)) + print(f'Project {prj_dir} does not exist. Rerun "hls4ml build -p {hls_dir}".') return solutions = _find_solutions(sln_dir) - print('Found {} solution(s) in {}.'.format(len(solutions), sln_dir)) + print(f'Found {len(solutions)} solution(s) in {sln_dir}.') for sln in solutions: - print('Reports for solution "{}":\n'.format(sln)) + print(f'Reports for solution "{sln}":\n') _find_reports(sln_dir + '/' + sln, top_func_name, full_report) + def _parse_project_script(path): prj_dir = None top_func_name = None project_path = path + '/project.tcl' - with open(project_path, 'r') as f: + with open(project_path) as f: for line in f.readlines(): if 'set project_name' in line: top_func_name = line.split('"')[-2] prj_dir = top_func_name + '_prj' + if 'set backend' in line: + backend_name = line.split('"')[-2] + + if 'accelerator' in backend_name: + top_func_name += '_axi' return prj_dir, top_func_name + def _find_solutions(sln_dir): solutions = [] @@ -67,49 +74,55 @@ def _find_solutions(sln_dir): return solutions + def _find_reports(sln_dir, top_func_name, full_report=False): - csim_file = sln_dir + '/csim/report/{}_csim.log'.format(top_func_name) + csim_file = sln_dir + f'/csim/report/{top_func_name}_csim.log' if os.path.isfile(csim_file): _show_csim_report(csim_file) else: print('C simulation report not found.') - syn_file = sln_dir + '/syn/report/{}_csynth.rpt'.format(top_func_name) + syn_file = sln_dir + f'/syn/report/{top_func_name}_csynth.rpt' if os.path.isfile(syn_file): _show_synth_report(syn_file, full_report) else: print('Synthesis report not found.') - cosim_file = sln_dir + '/sim/report/{}_cosim.rpt'.format(top_func_name) + cosim_file = sln_dir + f'/sim/report/{top_func_name}_cosim.rpt' if os.path.isfile(cosim_file): _show_cosim_report(cosim_file) else: print('Co-simulation report not found.') + def _show_csim_report(csim_file): - with open(csim_file, 'r') as f: + with open(csim_file) as f: print('C SIMULATION RESULT:') print(f.read()) + def _show_synth_report(synth_file, full_report=False): - with open(synth_file, 'r') as f: + with open(synth_file) as f: print('SYNTHESIS REPORT:') for line in f.readlines()[2:]: if not full_report and '* DSP48' in line: break - print(line, end = '') + print(line, end='') + def _show_cosim_report(cosim_file): - with open(cosim_file, 'r') as f: + with open(cosim_file) as f: print('CO-SIMULATION RESULT:') print(f.read()) + def _get_abs_and_percentage_values(unparsed_cell): return int(unparsed_cell.split('(')[0]), float(unparsed_cell.split('(')[1].replace('%', '').replace(')', '')) + def 
parse_vivado_report(hls_dir): if not os.path.exists(hls_dir): - print('Path {} does not exist. Exiting.'.format(hls_dir)) + print(f'Path {hls_dir} does not exist. Exiting.') return prj_dir = None @@ -124,19 +137,19 @@ def parse_vivado_report(hls_dir): sln_dir = hls_dir + '/' + prj_dir if not os.path.exists(sln_dir): - print('Project {} does not exist. Rerun "hls4ml build -p {}".'.format(prj_dir, hls_dir)) + print(f'Project {prj_dir} does not exist. Rerun "hls4ml build -p {hls_dir}".') return solutions = _find_solutions(sln_dir) if len(solutions) > 1: - print('WARNING: Found {} solution(s) in {}. Using the first solution.'.format(len(solutions), sln_dir)) + print(f'WARNING: Found {len(solutions)} solution(s) in {sln_dir}. Using the first solution.') report = {} sim_file = hls_dir + '/tb_data/csim_results.log' if os.path.isfile(sim_file): csim_results = [] - with open(sim_file, 'r') as f: + with open(sim_file) as f: for line in f.readlines(): csim_results.append([r for r in line.split()]) report['CSimResults'] = csim_results @@ -144,18 +157,19 @@ def parse_vivado_report(hls_dir): sim_file = hls_dir + '/tb_data/rtl_cosim_results.log' if os.path.isfile(sim_file): cosim_results = [] - with open(sim_file, 'r') as f: + with open(sim_file) as f: for line in f.readlines(): cosim_results.append([r for r in line.split()]) report['CosimResults'] = cosim_results - syn_file = sln_dir + '/' + solutions[0] + '/syn/report/{}_csynth.xml'.format(top_func_name) + syn_file = sln_dir + '/' + solutions[0] + f'/syn/report/{top_func_name}_csynth.xml' c_synth_report = {} if os.path.isfile(syn_file): root = ET.parse(syn_file).getroot() # Performance perf_node = root.find('./PerformanceEstimates') + c_synth_report['TargetClockPeriod'] = root.find('./UserAssignments/TargetClockPeriod').text c_synth_report['EstimatedClockPeriod'] = perf_node.find('./SummaryOfTimingAnalysis/EstimatedClockPeriod').text c_synth_report['BestLatency'] = perf_node.find('./SummaryOfOverallLatency/Best-caseLatency').text c_synth_report['WorstLatency'] = perf_node.find('./SummaryOfOverallLatency/Worst-caseLatency').text @@ -187,9 +201,10 @@ def parse_vivado_report(hls_dir): section = int(match.group(1)) # Sometimes, phrases such as 'CLB Registers' can show up in the non-tabular sections of the report if '|' in line: - if 'CLB LUTs' in line and section == 1: + # CLB (2019.X) vs. 
Slice (2020.X) + if ('CLB LUTs' in line or 'Slice LUTs' in line) and section == 1: vivado_synth_rpt['LUT'] = line.split('|')[2].strip() - elif 'CLB Registers' in line and section == 1: + elif ('CLB Registers' in line or 'Slice Registers' in line) and section == 1: vivado_synth_rpt['FF'] = line.split('|')[2].strip() elif 'Block RAM Tile' in line and section == 2: vivado_synth_rpt['BRAM_18K'] = line.split('|')[2].strip() @@ -201,13 +216,13 @@ def parse_vivado_report(hls_dir): else: print('Vivado synthesis report not found.') - cosim_file = sln_dir + '/' + solutions[0] + '/sim/report/{}_cosim.rpt'.format(top_func_name) + cosim_file = sln_dir + '/' + solutions[0] + f'/sim/report/{top_func_name}_cosim.rpt' if os.path.isfile(cosim_file): cosim_report = {} - with open(cosim_file, 'r') as f: + with open(cosim_file) as f: for line in f.readlines(): if re.search('VHDL', line) or re.search('Verilog', line): - result = line[1:].split() # [1:] skips the leading '|' + result = line[1:].split() # [1:] skips the leading '|' result = [res[:-1] if res[-1] == '|' else res for res in result] # RTL, Status, Latency-min, Latency-avg, Latency-max, Interval-min, Interval-avg, Interval-max if result[1] == 'NA': @@ -224,27 +239,58 @@ def parse_vivado_report(hls_dir): print('Cosim report not found.') if os.path.isfile(cosim_file): - transaction_file = sln_dir + '/' + solutions[0] + '/sim/' + report['CosimReport']['RTL'].lower() + '/' + top_func_name + '.performance.result.transaction.xml' + transaction_file = ( + sln_dir + + '/' + + solutions[0] + + '/sim/' + + report['CosimReport']['RTL'].lower() + + '/' + + top_func_name + + '.performance.result.transaction.xml' + ) if os.path.isfile(transaction_file): - cosim_transactions = {'InitiationInterval': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, - 'Latency': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}} - with open(transaction_file, 'r') as f: + cosim_transactions = { + 'InitiationInterval': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, + 'Latency': {'max': 0, 'min': sys.maxsize, 'avg': 0.0}, + } + with open(transaction_file) as f: i = 1 for line in f.readlines(): if re.search('transaction', line): result = line.split() # update min if result[3] != 'x': - cosim_transactions['InitiationInterval']['min'] = int(result[3]) if int(result[3]) < cosim_transactions['InitiationInterval']['min'] else cosim_transactions['InitiationInterval']['min'] - cosim_transactions['Latency']['min'] = int(result[2]) if int(result[2]) < cosim_transactions['Latency']['min'] else cosim_transactions['Latency']['min'] + cosim_transactions['InitiationInterval']['min'] = ( + int(result[3]) + if int(result[3]) < cosim_transactions['InitiationInterval']['min'] + else cosim_transactions['InitiationInterval']['min'] + ) + cosim_transactions['Latency']['min'] = ( + int(result[2]) + if int(result[2]) < cosim_transactions['Latency']['min'] + else cosim_transactions['Latency']['min'] + ) # update max if result[3] != 'x': - cosim_transactions['InitiationInterval']['max'] = int(result[3]) if int(result[3]) > cosim_transactions['InitiationInterval']['max'] else cosim_transactions['InitiationInterval']['max'] - cosim_transactions['Latency']['max'] = int(result[2]) if int(result[2]) > cosim_transactions['Latency']['max'] else cosim_transactions['Latency']['max'] + cosim_transactions['InitiationInterval']['max'] = ( + int(result[3]) + if int(result[3]) > cosim_transactions['InitiationInterval']['max'] + else cosim_transactions['InitiationInterval']['max'] + ) + cosim_transactions['Latency']['max'] = ( + 
int(result[2]) + if int(result[2]) > cosim_transactions['Latency']['max'] + else cosim_transactions['Latency']['max'] + ) # update avg if result[3] != 'x': - cosim_transactions['InitiationInterval']['avg'] = cosim_transactions['InitiationInterval']['avg'] + float((int(result[3]) - cosim_transactions['InitiationInterval']['avg']) / i) - cosim_transactions['Latency']['avg'] = cosim_transactions['Latency']['avg'] + float((int(result[2]) - cosim_transactions['Latency']['avg']) / i) + cosim_transactions['InitiationInterval']['avg'] = cosim_transactions['InitiationInterval'][ + 'avg' + ] + float((int(result[3]) - cosim_transactions['InitiationInterval']['avg']) / i) + cosim_transactions['Latency']['avg'] = cosim_transactions['Latency']['avg'] + float( + (int(result[2]) - cosim_transactions['Latency']['avg']) / i + ) i += 1 report['CosimReport']['LatencyMin'] = cosim_transactions['Latency']['min'] @@ -258,10 +304,10 @@ def parse_vivado_report(hls_dir): util_rpt_file = hls_dir + '/util.rpt' if os.path.isfile(util_rpt_file): implementation_report = {} - with open(util_rpt_file, 'r') as f: + with open(util_rpt_file) as f: for line in f.readlines(): - if re.search('\(top\)', line): - # Total LUTs | Logic LUTs | LUTRAMs | SRLs | FFs | RAMB36 | RAMB18 (| URAM )| DSP48 Blocks + if re.search(r'\(top\)', line): + # Total LUTs | Logic LUTs | LUTRAMs | SRLs | FFs | RAMB36 | RAMB18 (| URAM )| DSP48 Blocks # skipping the first 2 unuseful cells with [:2] results = [_get_abs_and_percentage_values(elem) for elem in line.replace('|', '').split()[2:]] implementation_report['TotLUTs'] = results[0][0] @@ -298,20 +344,25 @@ def parse_vivado_report(hls_dir): else: print('Implementation report not found.') - timing_report_file = hls_dir + '/' + prj_dir.split('_')[0] + '_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_timing_summary_routed.rpt' + timing_report_file = ( + hls_dir + + '/' + + prj_dir.split('_')[0] + + '_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_timing_summary_routed.rpt' + ) if os.path.isfile(timing_report_file): timing_report = {} - with open(timing_report_file, 'r') as f: + with open(timing_report_file) as f: while not re.search('WNS', next(f)): pass # skip the successive line next(f) result = next(f).split() - timing_report['WNS'] = float(result[0]) - timing_report['TNS'] = float(result[1]) - timing_report['WHS'] = float(result[4]) - timing_report['THS'] = float(result[5]) + timing_report['WNS'] = float(result[0]) + timing_report['TNS'] = float(result[1]) + timing_report['WHS'] = float(result[4]) + timing_report['THS'] = float(result[5]) timing_report['WPWS'] = float(result[8]) timing_report['TPWS'] = float(result[9]) @@ -320,3 +371,304 @@ def parse_vivado_report(hls_dir): print('Timing report not found.') return report + +def print_vivado_report(report_dict): + if _is_running_in_notebook(): + _print_ipython_report(report_dict) + else: + _print_str_report(report_dict) + + +def _print_ipython_report(report_dict): + from IPython.display import HTML, display + + html = '\n' + _table_css + '
' + body = _make_report_body(report_dict, _make_html_table_template, _make_html_header) + html += body + '\n
\n' + display(HTML(html)) + + +def _print_str_report(report_dict): + body = _make_report_body(report_dict, _make_str_table_template, _make_str_header) + print(body) + + +def _is_running_in_notebook(): + try: + from IPython import get_ipython + + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + return True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + +_table_css = """ + +""" + +_table_base_template = """ + + + + + + + +{table_rows} + +
{table_header}
+""" + +_row_base_template = " {row_title}{{{row_key}}}" + + +def _make_html_table_template(table_header, row_templates): + table_rows = '\n'.join( + [_row_base_template.format(row_title=row_title, row_key=row_key) for row_title, row_key in row_templates.items()] + ) + return _table_base_template.format(table_header=table_header, table_rows=table_rows) + + +def _make_str_table_template(table_header, row_templates): + len_title = 0 + for row_title in row_templates.keys(): + if len(row_title) > len_title: + len_title = len(row_title) + head = f'\n - {table_header}:\n' + table_rows = '\n'.join( + [' ' + f'{row_title}:'.ljust(len_title + 1) + f' {{{row_key}}}' for row_title, row_key in row_templates.items()] + ) + return head + table_rows + '\n' + + +def _make_html_header(report_header): + return f'

{report_header}:

' + + +def _make_str_header(report_header): + sep = '=' * 54 + '\n' + return '\n' + sep + '== ' + report_header + '\n' + sep + + +def _convert_cycles_to_time(n_cycles, clock_period): + time_in_ns = n_cycles * clock_period + if time_in_ns < 1000: + return str(time_in_ns) + ' ns' + + time_in_us = time_in_ns / 1000 + if time_in_us < 1000: + return str(time_in_us) + ' \u00B5s' + + time_in_ms = time_in_us / 1000 + if time_in_ms < 1000: + return str(time_in_ms) + ' ms' + + time_in_s = time_in_ms / 1000 + if time_in_s < 1000: + return str(time_in_s) + ' s' + + +def _make_report_body(report_dict, make_table_template, make_header_template): + body = '' + + if 'CSynthesisReport' in report_dict: + body += make_header_template('C Synthesis report') + perf_rows = { + 'Best-case latency': 'best_latency', + 'Worst-case latency': 'worst_latency', + 'Interval Min': 'interval_min', + 'Interval Max': 'interval_max', + 'Estimated Clock Period': 'estimated_clock', + } + area_rows = { + 'BRAM_18K': 'bram', + 'DSP48E': 'dsp', + 'FF': 'ff', + 'LUT': 'lut', + 'URAM': 'uram', + } + body += make_table_template('Performance estimates', perf_rows) + body += make_table_template('Resource estimates', area_rows) + + csynth_report = report_dict['CSynthesisReport'] + target_clock = float(csynth_report['TargetClockPeriod']) + best_latency = int(csynth_report['BestLatency']) + worst_latency = int(csynth_report['BestLatency']) + bram = int(csynth_report['BRAM_18K']) + avail_bram = int(csynth_report['AvailableBRAM_18K']) + dsp = int(csynth_report['DSP48E']) + avail_dsp = int(csynth_report['AvailableDSP48E']) + ff = int(csynth_report['FF']) + avail_ff = int(csynth_report['AvailableFF']) + lut = int(csynth_report['LUT']) + avail_lut = int(csynth_report['AvailableLUT']) + if 'URAM' in csynth_report: + uram = int(csynth_report['URAM']) + avail_uram = int(csynth_report['AvailableURAM']) + + params = {} + + params['best_latency'] = str(best_latency) + ' (' + _convert_cycles_to_time(best_latency, target_clock) + ')' + params['worst_latency'] = str(worst_latency) + ' (' + _convert_cycles_to_time(worst_latency, target_clock) + ')' + params['interval_min'] = csynth_report['IntervalMin'] + params['interval_max'] = csynth_report['IntervalMax'] + params['estimated_clock'] = csynth_report['EstimatedClockPeriod'] + + params['bram'] = str(bram) + ' / ' + str(avail_bram) + ' (' + str(round(bram / avail_bram * 100, 1)) + '%)' + params['dsp'] = str(dsp) + ' / ' + str(avail_dsp) + ' (' + str(round(dsp / avail_dsp * 100, 1)) + '%)' + params['ff'] = str(ff) + ' / ' + str(avail_ff) + ' (' + str(round(ff / avail_ff * 100, 1)) + '%)' + params['lut'] = str(lut) + ' / ' + str(avail_lut) + ' (' + str(round(lut / avail_lut * 100, 1)) + '%)' + if 'URAM' in csynth_report and avail_uram > 0: + params['uram'] = str(uram) + ' / ' + str(avail_uram) + ' (' + str(round(uram / avail_uram * 100, 1)) + '%)' + else: + params['uram'] = 'N/A' + + body = body.format(**params) + + if 'VivadoSynthReport' in report_dict: + body += make_header_template('Vivado Synthesis report') + area_rows = { + 'BRAM_18K': 'bram', + 'DSP48E': 'dsp', + 'FF': 'ff', + 'LUT': 'lut', + 'URAM': 'uram', + } + body += make_table_template('Resource utilization', area_rows) + + vsynth_report = report_dict['VivadoSynthReport'] + + params = {} + params['bram'] = vsynth_report['BRAM_18K'] + params['dsp'] = vsynth_report['DSP48E'] + params['ff'] = vsynth_report['FF'] + params['lut'] = vsynth_report['LUT'] + params['uram'] = vsynth_report['URAM'] if 'URAM' in vsynth_report else 'N/A' + + body = 
body.format(**params) + + if 'CosimReport' in report_dict: + body += make_header_template('Co-Simulation report') + perf_rows = { + 'Status': 'status', + 'Best-case latency': 'best_latency', + 'Worst-case latency': 'worst_latency', + 'Interval Min': 'interval_min', + 'Interval Max': 'interval_max', + } + body += make_table_template('Performance', perf_rows) + + cosim_report = report_dict['CosimReport'] + + params = {} + params['status'] = cosim_report['Status'] + params['best_latency'] = cosim_report['LatencyMin'] + params['worst_latency'] = cosim_report['LatencyMax'] + params['interval_min'] = cosim_report['IntervalMin'] + params['interval_max'] = cosim_report['IntervalMax'] + + body = body.format(**params) + + if 'ImplementationReport' in report_dict: + body += make_header_template('Implementation report') + area_rows = { + 'Total LUTs': 'lut', + 'Logic LUTs': 'logiclut', + 'LUTRAM': 'lutram', + 'SRLs': 'srl', + 'FF': 'ff', + 'RAMB18': 'bram18', + 'RAMB36': 'bram36', + 'DSP': 'dsp', + 'URAM': 'uram', + } + body += make_table_template('Resource utilization', area_rows) + + impl_report = report_dict['ImplementationReport'] + + params = {} + params['lut'] = impl_report['TotLUTs'] + ' (' + impl_report['TotLUTs%'] + '%)' + params['logiclut'] = impl_report['LogicLUTs'] + ' (' + impl_report['LogicLUTs%'] + '%)' + params['lutram'] = impl_report['LUTRAMs'] + ' (' + impl_report['LUTRAMs%'] + '%)' + params['srl'] = impl_report['SRLs'] + ' (' + impl_report['SRLs%'] + '%)' + params['ff'] = impl_report['FFs'] + ' (' + impl_report['FFs%'] + '%)' + params['bram18'] = impl_report['RAMB18s'] + ' (' + impl_report['RAMB18s%'] + '%)' + params['bram36'] = impl_report['RAMB36s'] + ' (' + impl_report['RAMB36s%'] + '%)' + params['dsp'] = impl_report['DSPs'] + ' (' + impl_report['DSPs%'] + '%)' + if 'URAMs' in impl_report: + params['uram'] = impl_report['URAMs'] + ' (' + impl_report['URAMs%'] + '%)' + else: + params['uram'] = 'N/A' + + body = body.format(**params) + + if 'TimingReport' in report_dict: + body += make_header_template('Timing report') + perf_rows = { + 'Worst Negative Slack (WNS)': 'wns', + 'Total Negative Slack (TNS)': 'tns', + 'Worst Hold Slack (WHS)': 'whs', + 'Total Hold Slack (THS)': 'ths', + 'Worst Pulse Width Slack (WPWS)': 'wpws', + 'Total Pulse Width Slack (TPWS)': 'tpws', + } + body += make_table_template('Timing', perf_rows) + + timing_report = report_dict['TimingReport'] + + params = {} + params['wns'] = round(timing_report['WNS'], 2) + params['tns'] = round(timing_report['TNS'], 2) + params['whs'] = round(timing_report['WHS'], 2) + params['ths'] = round(timing_report['THS'], 2) + params['wpws'] = round(timing_report['WPWS'], 2) + params['tpws'] = round(timing_report['TPWS'], 2) + + body = body.format(**params) + + return body diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index 6e9b243d8..c3fe4ec40 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -2,54 +2,46 @@ #define DEFINES_H_ /* -* Intel HLS makes use of three streaming interfaces: -* (1) stream_in - used as the main input to a component -* (2) stream_out - used as the main output of a component -* (3) stream - allows both reading and writing; used for inter-component connections -* ihc::stream has a implicitly deleted constructor and therefore, cannot be used as the output of a function/component -* Therefore, variables of type 'stream' are always passed by reference -*/ + * Intel HLS makes use of three streaming 
interfaces: + * (1) stream_in - used as the main input to a component + * (2) stream_out - used as the main output of a component + * (3) stream - allows both reading and writing; used for inter-component connections + * ihc::stream has a implicitly deleted constructor and therefore, cannot be used as the output of a function/component + * Therefore, variables of type 'stream' are always passed by reference + */ #ifndef __INTELFPGA_COMPILER__ -#include "ac_int.h" #include "ac_fixed.h" +#include "ac_int.h" #define hls_register #include "stream.h" -template -using stream = nnet::stream; -template -using stream_in = nnet::stream; -template -using stream_out = nnet::stream; +template using stream = nnet::stream; +template using stream_in = nnet::stream; +template using stream_out = nnet::stream; #else -#include "HLS/hls.h" -#include "HLS/ac_int.h" #include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/hls.h" -template -using stream = ihc::stream; -template -using stream_in = ihc::stream_in; -template -using stream_out = ihc::stream_out; +template using stream = ihc::stream; +template using stream_in = ihc::stream_in; +template using stream_out = ihc::stream_out; #endif // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -//hls-fpga-machine-learning insert numbers - - -//hls-fpga-machine-learning insert layer-precision +// hls-fpga-machine-learning insert numbers +// hls-fpga-machine-learning insert layer-precision -#define DIV_ROUNDUP(n,d) ((n + d - 1) / d) -#define MIN(n,d) (n > d ? d : n) -#define MAX(n,d) (n < d ? d : n) +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? d : n) #endif diff --git a/hls4ml/templates/quartus/firmware/myproject.cpp b/hls4ml/templates/quartus/firmware/myproject.cpp index 9ca07fce6..8bde3194a 100644 --- a/hls4ml/templates/quartus/firmware/myproject.cpp +++ b/hls4ml/templates/quartus/firmware/myproject.cpp @@ -18,16 +18,17 @@ // #include "myproject.h" +#include "parameters.h" -//hls-fpga-machine-learning insert weights +// hls-fpga-machine-learning insert weights /* -* Intel HLS requires that all 'stream' types are: -* (1) Passed by reference to the top-level entity or -* (2) Declared as global variables, outside of the main function -* Therefore, layer inputs/output (connections betweenn individual layers) are declared here -*/ -//hls-fpga-machine-learning insert inter-task streams + * Intel HLS requires that all 'stream' types are: + * (1) Passed by reference to the top-level entity or + * (2) Declared as global variables, outside of the main function + * Therefore, layer inputs/output (connections betweenn individual layers) are declared here + */ +// hls-fpga-machine-learning insert inter-task streams #ifndef __INTELFPGA_COMPILER__ /* @@ -42,25 +43,25 @@ - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) * This distinction is handled in quartus_writer.py */ -//hls-fpga-machine-learning instantiate GCC top-level +// hls-fpga-machine-learning instantiate GCC top-level #else // Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -//hls-fpga-machine-learning insert cpragmas +// hls-fpga-machine-learning insert cpragmas /* -* The top-level function used during HLS Synthesis goes here -* In a similar manner to GCC, there is a distinction between io_stream & io_parallel -*/ -//hls-fpga-machine-learning instantiate HLS top-level -#endif - // If 
using io_parallel, the output needs to be initialised and returned at the end of this function - // If using io_stream, no output is initialised, as it is passed by reference to the top-level function - //hls-fpga-machine-learning initialize input/output + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level +#endif +// If using io_parallel, the output needs to be initialised and returned at the end of this function +// If using io_stream, no output is initialised, as it is passed by reference to the top-level function +// hls-fpga-machine-learning initialize input/output + +// **************************************** +// NETWORK INSTANTIATION +// **************************************** - // **************************************** - // NETWORK INSTANTIATION - // **************************************** +// hls-fpga-machine-learning insert layers - //hls-fpga-machine-learning insert layers - - //hls-fpga-machine-learning return +// hls-fpga-machine-learning return diff --git a/hls4ml/templates/quartus/firmware/myproject.h b/hls4ml/templates/quartus/firmware/myproject.h index 1e11ec43d..1ac03b724 100644 --- a/hls4ml/templates/quartus/firmware/myproject.h +++ b/hls4ml/templates/quartus/firmware/myproject.h @@ -21,23 +21,23 @@ #define MYPROJECT_H_ #ifndef __INTELFPGA_COMPILER__ -#include "ac_int.h" #include "ac_fixed.h" +#include "ac_int.h" #define hls_register #else -#include "HLS/hls.h" -#include "HLS/ac_int.h" #include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/hls.h" #endif // Streams are explicitly defined in defines.h, which are included for parameters.h // Defining them again in this file will cause compile-time errors -#include "parameters.h" +#include "defines.h" // If using io_parallel, inputs and output need to be initialised before calling the top-level function // If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function -//hls-fpga-machine-learning insert inputs -//hls-fpga-machine-learning insert outputs +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs #ifndef __INTELFPGA_COMPILER__ /* @@ -52,16 +52,16 @@ - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) * This distinction is handled in quartus_writer.py */ -//hls-fpga-machine-learning instantiate GCC top-level +// hls-fpga-machine-learning instantiate GCC top-level #else // Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -//hls-fpga-machine-learning insert cpragmas +// hls-fpga-machine-learning insert cpragmas /* -* The top-level function used during HLS Synthesis goes here -* In a similar manner to GCC, there is a distinction between io_stream & io_parallel -*/ -//hls-fpga-machine-learning instantiate HLS top-level + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level #endif #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h old mode 100755 new mode 100644 index 20790a390..b750a688e --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ 
-24,8 +24,7 @@ namespace nnet { -struct activ_config -{ +struct activ_config { // IO size static const unsigned n_in = 10; @@ -37,17 +36,15 @@ struct activ_config static const unsigned reuse_factor = 1; // Internal data type definitions - typedef ac_fixed<16,8> table_t; + typedef ac_fixed<16, 8> table_t; }; // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii -void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = 0; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; } } -template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii MAX_INT) res[ii] = MAX_INT; - else res[ii] = datareg; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; } } -template -void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } -template -void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } - // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - static const int MAX_VALUE=8; - #include "activation_tables/sigmoid_table.tb" +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" #pragma unroll - for (int ii=0; ii < CONFIG_T::n_in; ii++) { - data_T absoluteValue hls_register; - res_T temp2 hls_register; - if(data[ii] < 0 ){ - absoluteValue = - data[ii]; - } - else{ + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T absoluteValue hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { absoluteValue = data[ii]; } - int index = ( absoluteValue *( CONFIG_T::table_size / MAX_VALUE)).to_int(); - if (absoluteValue > MAX_VALUE ) index = CONFIG_T::table_size - 1; - temp2 = (res_T) sigmoid_table[index]; - if(data[ii] < 0 ){ - res[ii] = 1-temp2; - } - else{ + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)sigmoid_table[index]; + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { res[ii] = temp2; } } @@ -126,35 +118,34 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Softmax Activation // ************************************************* -enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3}; +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; -template -inline unsigned softmax_stable_idx_from_real_val(const data_T x){ +template inline unsigned softmax_stable_idx_from_real_val(const data_T x) { // Number of address bits for table 
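For orientation, the index helper being restyled here addresses the softmax lookup tables by slicing the top N bits of the fixed-point input, with N = ceillog2(table_size), skipping the sign bit and forcing a non-zero index for non-zero inputs. A rough Python model of that addressing, assuming the raw bit pattern of the ac_fixed value is available as an integer and that table_size is a power of two (names are illustrative):

def softmax_stable_idx(x_bits, width, table_size):
    # number of table address bits; assumes table_size is a power of two
    n = (table_size - 1).bit_length()
    # take the top n bits below the sign bit, i.e. bits [width-2 .. width-n-1]
    y = (x_bits >> (width - n - 1)) & ((1 << n) - 1)
    # the most negative input would otherwise alias to index 0
    if x_bits != 0 and y == 0:
        y |= 1
    return y
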
static constexpr int N = ceillog2(CONFIG_T::table_size); // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width-N-1); + hls_register ac_int y = x.template slc(x.width - N - 1); // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness - if (x != 0 && y == 0) y[0] = 1; + if (x != 0 && y == 0) + y[0] = 1; return y.to_uint(); } -template -inline unsigned softmax_latency_idx_from_real_val(const data_T x){ +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { // Number of address bits for table - static constexpr int N = ceillog2(CONFIG_T::table_size); + static constexpr int N = ceillog2(CONFIG_T::table_size); // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width-N); + hls_register ac_int y = x.template slc(x.width - N); return y.to_uint(); } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - // Look-up tables - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" // Find maximum Op_max op_max; @@ -163,102 +154,109 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // For the diffs, use the same type as the input but force rounding and saturation hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; } // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; } // Explicitly sum previously calculated exponentials with an adder tree Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { res[i] = exp_res[i] * inv_exp_sum; } } // TODO - Improve accuracy template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - #include "activation_tables/exp_table_latency.tb" - #include "activation_tables/invert_table_latency.tb" - +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; } // Explicitly sum the results with an adder tree. 
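For reference, the latency softmax variant above looks up per-element exponentials from a table, reduces them with a balanced adder tree, and then scales each exponential by a table-derived reciprocal of the sum. A small NumPy sketch of that dataflow, with the tables and the index function left as stand-ins rather than the actual HLS types:

import numpy as np

def softmax_latency_model(x, exp_lut, inv_lut, idx_fn):
    # exp_lut / inv_lut are precomputed tables; idx_fn maps a value to a table index
    exp_res = np.array([exp_lut[idx_fn(v)] for v in x])
    inv_sum = inv_lut[idx_fn(exp_res.sum())]
    return exp_res * inv_sum
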
Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { res[i] = exp_res[i] * inv_exp_sum; } } -template +template void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #include "activation_tables/exp_table_legacy.tb" - #include "activation_tables/invert_table_legacy.tb" +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" hls_register int data_round[CONFIG_T::n_in]; - New_loop: +New_loop: #pragma unroll - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; exp_res_temp += temp_exp; } } - int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; + int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; res[ii] = invert_table_legacy[exp_res_index]; } } -template +template void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { - res[i] = (res_T) 0; + res[i] = (res_T)0; } hls_register data_T maximum = data[0]; - hls_register int idx = 0; + hls_register int idx = 0; #pragma ii 1 for (int i = 1; i < CONFIG_T::n_in; i++) { @@ -268,56 +266,55 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } - res[idx] = (res_T) 1; + res[idx] = (res_T)1; } -template -inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - switch(CONFIG_T::implementation) { - case softmax_implementation::stable: - softmax_stable(data, res); - break; - case softmax_implementation::latency: - softmax_latency(data, res); - break; - case softmax_implementation::legacy: - softmax_legacy(data, res); - break; - default: - softmax_stable(data, res); - break; - case softmax_implementation::argmax: - softmax_argmax(data, res); - break; +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + switch (CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; } } // ************************************************* // TanH Activation // ************************************************* -template +template void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - static const int MAX_VALUE=4; - // Initialize the lookup 
table - #include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; +// Initialize the lookup table +#include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii index = ( temp *(CONFIG_T::table_size/MAX_VALUE)).to_int(); - if (temp > MAX_VALUE ) index = CONFIG_T::table_size-1; - temp2 = (res_T) tanh_table[index]; - if(data[ii] < 0 ){ + ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)tanh_table[index]; + if (data[ii] < 0) { res[ii] = -temp2; - } - else{ + } else { res[ii] = temp2; } } @@ -326,95 +323,108 @@ void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // ************************************************* // Hard sigmoid Activation // ************************************************* -template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - data_T slope = (data_T) 0.2; - data_T shift = (data_T) 0.5; +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 1) datareg = 1; - else if (datareg < 0) datareg = 0; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; res[ii] = datareg; } } +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; } } // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) -{ +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii theta) res[ii] = datareg; - else res[ii] = 0; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; } } // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/softplus_table.tb" +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii data_round = (data[ii]*CONFIG_T::table_size/16).to_int(); - ac_int<16> index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softplus_table[index]; 
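The softplus lookup being reformatted here maps an input range of roughly [-8, 8) affinely onto table addresses [0, table_size), clamping anything outside that range to the first or last entry. An illustrative Python version of the address computation, assuming (as the HLS code does) that 16 units of input span the whole table:

def softplus_table_index(x, table_size):
    # scale x so 16 units of input cover the table, shift the zero point to the
    # middle of the table, then clamp to valid addresses
    index = int(x * table_size / 16) + 8 * table_size // 16
    return min(max(index, 0), table_size - 1)
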
+ for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; } } // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - static const int MAX_VALUE=8; - // Initialize the lookup table - #include "activation_tables/softsign_table.tb" +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii index = (temp*CONFIG_T::table_size/MAX_VALUE).to_int(); - if (temp > MAX_VALUE) index = CONFIG_T::table_size-1; - temp2 = (res_T) softsign_table[index]; - if(data[ii] < 0 ){ + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)softsign_table[index]; + if (data[ii] < 0) { res[ii] = -temp2; - } - else{ + } else { res[ii] = temp2; } } @@ -423,48 +433,45 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // ELU Activation // ************************************************* -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/elu_table.tb" +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/elu_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii= 0) { res[ii] = datareg; } else { - ac_int<16> index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = alpha * elu_table[index]; } } } -template -void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - elu(data, 1.0, res); +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template -void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - // Initialize the lookup table - #include "activation_tables/selu_table.tb" +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" // Index into the lookup table based on data #pragma unroll - for (int ii=0; ii= 0) { res[ii] = res_T(1.0507009873554804934193349852946) * datareg; } else { - ac_int<16> index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = selu_table[index]; } } @@ -473,52 +480,56 @@ void selu(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha[ii] * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; } } // ************************************************* // Binary TanH Activation // ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll - for (int ii=0; ii 0 ) cache = 1; - else cache = -1; + if (datareg > 0) + cache = 1; + else + cache = -1; - res[ii] = (res_T) cache; + res[ii] = (res_T)cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ - #pragma unroll - for (int ii=0; ii 1 ) cache = 1; - else if( datareg > -1 && datareg <= 1) cache=0; - else cache = -1; - - res[ii] = (res_T) cache; - } +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = 2 * data[ii]; + res_T cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } } -} +} // namespace nnet #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h index 3dae6d8d5..f0562a9b2 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h @@ -4,20 +4,19 @@ #include "nnet_common.h" #include "nnet_types.h" -namespace nnet{ +namespace nnet { // ************************************************* // Linear Activation // ************************************************* -template -void linear(stream &data, stream &res) { - LinearActLoop: +template void linear(stream &data, stream &res) { +LinearActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - LinearPackLoop: + LinearPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { out_data[j] = in_data[j]; @@ -30,19 +29,20 @@ void linear(stream &data, stream &res) { // ************************************************* // ReLU Activation // ************************************************* -template -void relu(stream &data, stream &res) { - ReLUActLoop: +template void relu(stream &data, stream &res) { +ReLUActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - ReLUPackLoop: + ReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -52,22 +52,24 @@ void relu(stream &data, stream &res) { // ************************************************* // Leaky RELU Activation 
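One more numerical reference for the hunk above: ternary_tanh quantizes its input to {-1, 0, +1} with a dead zone within roughly ±0.5 of zero (the code doubles the input and compares against ±1). A NumPy sketch of the same mapping, usable as a software cross-check and not part of hls4ml:

import numpy as np

def ternary_tanh_model(x):
    # +1 for x > 0.5, 0 for -0.5 < x <= 0.5, -1 otherwise
    d = 2 * np.asarray(x)
    return np.where(d > 1, 1, np.where(d > -1, 0, -1))
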
// ************************************************* -template +template void leaky_relu(stream &data, const typename data_T::value_type alpha, stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - - LeakyReLUActLoop: + +LeakyReLUActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - LeakyReLUPackLoop: + LeakyReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; } res.write(out_data); @@ -77,19 +79,21 @@ void leaky_relu(stream &data, const typename data_T::value_type alpha, s // ************************************************* // Thresholded RELU Activation // ************************************************* -template +template void thresholded_relu(stream &data, const typename data_T::value_type theta, stream &res) { - ThresholdedReLUActLoop: +ThresholdedReLUActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - ThresholdedReLUPackLoop: + ThresholdedReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > theta) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -99,28 +103,29 @@ void thresholded_relu(stream &data, const typename data_T::value_type th // ************************************************* // ELU Activation // ************************************************* -template +template void elu(stream &data, const typename data_T::value_type alpha, stream &res) { - #include "activation_tables/elu_table.tb" +#include "activation_tables/elu_table.tb" constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - EluActLoop: +EluActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - EluPackLoop: + EluPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { - int index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = alpha * elu_table[index]; } } @@ -129,33 +134,32 @@ void elu(stream &data, const typename data_T::value_type alpha, stream -void elu(stream &data, stream &res) { +template void elu(stream &data, stream &res) { elu(data, 1.0, res); } // ************************************************* // SeLU Activation // ************************************************* -template -void selu(stream &data, stream &res) { - #include "activation_tables/selu_table.tb" +template void selu(stream &data, stream &res) { +#include "activation_tables/selu_table.tb" - SeluActLoop: +SeluActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SeluPackLoop: + SeluPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register 
typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = typename data_T::value_type (1.0507009873554804934193349852946) * datareg; + out_data[j] = typename data_T::value_type(1.0507009873554804934193349852946) * datareg; } else { - int index = (datareg*CONFIG_T::table_size/-8).to_int(); - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = selu_table[index]; } } @@ -167,22 +171,24 @@ void selu(stream &data, stream &res) { // ************************************************* // PReLU Activation // ************************************************* -template +template void prelu(stream &data, const typename data_T::value_type alpha[CONFIG_T::n_in], stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - - PReLUActLoop: + +PReLUActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - PReLUPackLoop: + PReLUPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha[i*res_T::size+j] * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; } res.write(out_data); @@ -192,23 +198,24 @@ void prelu(stream &data, const typename data_T::value_type alpha[CONFIG_ // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(stream &data, stream &res) { - #include "activation_tables/softplus_table.tb" +template void softplus(stream &data, stream &res) { +#include "activation_tables/softplus_table.tb" - SoftplusActLoop: +SoftplusActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SoftplusPackLoop: + SoftplusPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register int data_round = (in_data[j]*CONFIG_T::table_size/16).to_int(); - hls_register int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + hls_register int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + hls_register int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softplus_table[index]; } @@ -219,35 +226,34 @@ void softplus(stream &data, stream &res) { // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(stream &data, stream &res) { - #include "activation_tables/softsign_table.tb" +template void softsign(stream &data, stream &res) { +#include "activation_tables/softsign_table.tb" static const int MAX_VALUE = 8; - SoftsignActLoop: +SoftsignActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SoftsignPackLoop: + SoftsignPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register typename data_T::value_type absValue;; - if(in_data[j] < 0){ + hls_register typename data_T::value_type absValue; + ; + if 
(in_data[j] < 0) { absValue = -in_data[j]; - } - else{ + } else { absValue = in_data[j]; } ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); - if (absValue > MAX_VALUE) index = CONFIG_T::table_size - 1; - if(in_data[j] < 0) { - out_data[j] = -(typename res_T::value_type) softsign_table[index]; - } - else { - out_data[j] = (typename res_T::value_type) softsign_table[index]; + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = -(typename res_T::value_type)softsign_table[index]; + } else { + out_data[j] = (typename res_T::value_type)softsign_table[index]; } } @@ -259,101 +265,106 @@ void softsign(stream &data, stream &res) { // Softmax Activation // ************************************************* -template -void softmax_stable(stream &data, stream &res) { - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" +template void softmax_stable(stream &data, stream &res) { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; hls_register typename data_T::value_type data_array[data_T::size]; - - SoftmaxArrayLoop: + +SoftmaxArrayLoop: #pragma ii pipeline - for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { data_T in_pack = data.read(); - - SoftmaxArrayPackLoop: - #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_T::size; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; - hls_register typename data_T::value_type x_max = reduce>(data_array, op_max); + hls_register typename data_T::value_type x_max = + reduce>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - hls_register ac_fixed d_xi_xmax[data_T::size]; + hls_register ac_fixed + d_xi_xmax[data_T::size]; #pragma unroll - for(unsigned j = 0; j < data_T::size; j++){ + for (unsigned j = 0; j < data_T::size; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + for (unsigned j = 0; j < data_T::size; j++) { exp_res[j] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. 
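The multiplier budget computed at the top of these streaming activations is plain integer arithmetic; the sketch below is a minimal, self-contained check of it in ordinary C++ (div_roundup is a local stand-in for the DIV_ROUNDUP macro from nnet_common.h, and the size/reuse_factor numbers are examples only, not taken from any real layer config).

// Local stand-in for DIV_ROUNDUP (assumed ceiling-division semantics).
constexpr unsigned div_roundup(unsigned n, unsigned d) { return (n + d - 1) / d; }

// Example numbers: 8 elements per stream beat, reuse_factor = 3.
constexpr unsigned size = 8;         // plays the role of data_T::size
constexpr unsigned reuse_factor = 3; // plays the role of CONFIG_T::reuse_factor

constexpr unsigned multiplier_limit = div_roundup(size, reuse_factor); // ceil(8/3) = 3
constexpr unsigned pipeline = size / multiplier_limit;                 // 8/3 = 2, used as the loop ii

static_assert(multiplier_limit == 3, "three multipliers shared across the pack");
static_assert(pipeline == 2, "initiation interval of 2 for the outer loop");

A larger reuse_factor therefore shrinks multiplier_limit and stretches the initiation interval, which is the trade-off the #pragma ii pipeline line encodes.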
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; res_T out_pack; - - SoftmaxInvPackLoop: + + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++){ - + for (unsigned j = 0; j < res_T::size; j++) { + // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - + out_pack[j] = exp_res[j] * inv_exp_sum; } - + res.write(out_pack); } } -template -void softmax_latency(stream &data, stream &res){ - #include "activation_tables/exp_table_latency.tb" - #include "activation_tables/invert_table_latency.tb" - +template void softmax_latency(stream &data, stream &res) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size]; - - SoftmaxExpLoop: + +SoftmaxExpLoop: #pragma ii pipeline - for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { data_T in_pack = data.read(); - - SoftmaxExpPackLoop: + + SoftmaxExpPackLoop: #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { - exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val(in_pack[j])]; + for (unsigned j = 0; j < data_T::size; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val(in_pack[j])]; } // Explicitly sum the results with an adder tree. 
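Both streaming softmax variants implement the same underlying arithmetic, only with table lookups in place of std::exp and the reciprocal; softmax_stable first subtracts the running maximum while softmax_latency indexes the exponential table with the raw inputs. A plain floating-point reference sketch of that arithmetic (no HLS types or pragmas implied; the function name is illustrative):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Behavioural reference: subtract the max, exponentiate, sum, then scale by the
// reciprocal of the sum. Assumes a non-empty input vector.
std::vector<float> softmax_reference(const std::vector<float> &x) {
    const float x_max = *std::max_element(x.begin(), x.end());

    std::vector<float> exp_res(x.size());
    float exp_sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); i++) {
        exp_res[i] = std::exp(x[i] - x_max); // exp_table lookup on d_xi_xmax in the HLS code
        exp_sum += exp_res[i];               // adder-tree reduce in the HLS code
    }

    const float inv_exp_sum = 1.0f / exp_sum; // invert_table lookup in the HLS code
    std::vector<float> res(x.size());
    for (std::size_t i = 0; i < x.size(); i++) {
        res[i] = exp_res[i] * inv_exp_sum;
    }
    return res;
}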
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; res_T out_pack; - SoftmaxInvPackLoop: + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++){ + for (unsigned j = 0; j < res_T::size; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -362,41 +373,42 @@ void softmax_latency(stream &data, stream &res){ } } -template -void softmax_legacy(stream &data, stream &res) { - #include "activation_tables/exp_table_legacy.tb" - #include "activation_tables/invert_table_legacy.tb" - +template void softmax_legacy(stream &data, stream &res) { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + // Index into the lookup table based on data for exponentials hls_register typename CONFIG_T::table_t exp_res[data_T::size]; hls_register typename CONFIG_T::table_t exp_diff_res; hls_register typename data_T::value_type data_cache[data_T::size]; - SoftmaxInitLoop: +SoftmaxInitLoop: #pragma ii 1 - for(unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { data_T in_pack = data.read(); - - SoftmaxInitPackLoop: + + SoftmaxInitPackLoop: #pragma unroll - for(unsigned j = 0; j < data_T::size; j++) { + for (unsigned j = 0; j < data_T::size; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } - SoftmaxExpLoop: + SoftmaxExpLoop: #pragma unroll for (int i = 0; i < data_T::size; i++) { - SoftmaxExpInner: + SoftmaxExpInner: #pragma unroll for (int j = 0; j < data_T::size; j++) { if (i == j) { exp_diff_res = 1; } else { - int data_round = ((data_cache[j] - data_cache[i])*CONFIG_T::table_size/16).to_int(); + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table_legacy[index]; } exp_res[i] += exp_diff_res; @@ -404,21 +416,22 @@ void softmax_legacy(stream &data, stream &res) { } res_T out_pack; - SoftmaxInvPackLoop: + SoftmaxInvPackLoop: #pragma unroll - for(unsigned j = 0; j < res_T::size; j++) { - int exp_res_index = (exp_res[j]*CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = (typename res_T::value_type) invert_table_legacy[exp_res_index]; + for (unsigned j = 0; j < res_T::size; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = (typename res_T::value_type)invert_table_legacy[exp_res_index]; } res.write(out_pack); } } -template -void 
softmax_argmax(stream &data, stream &res) { +template void softmax_argmax(stream &data, stream &res) { #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); @@ -426,11 +439,11 @@ void softmax_argmax(stream &data, stream &res) { #pragma unroll for (int i = 0; i < res_T::size; i++) { - out_data[i] = (typename res_T::value_type) 0; + out_data[i] = (typename res_T::value_type)0; } hls_register typename data_T::value_type maximum = in_data[0]; - hls_register int idx = 0; + hls_register int idx = 0; #pragma ii 1 for (int i = 1; i < res_T::size; i++) { @@ -440,64 +453,68 @@ void softmax_argmax(stream &data, stream &res) { } } - out_data[idx] = (typename res_T::value_type) 1; + out_data[idx] = (typename res_T::value_type)1; res.write(out_data); } } -template -void softmax(stream &data, stream &res) { - switch(CONFIG_T::implementation) { - case softmax_implementation::latency: - softmax_latency(data, res); - break; - case softmax_implementation::stable: - softmax_stable(data, res); - break; - case softmax_implementation::legacy: - softmax_legacy(data, res); - break; - case softmax_implementation::argmax: - softmax_argmax(data, res); - break; - default: - softmax_stable(data, res); - break; - } +template void softmax(stream &data, stream &res) { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + default: + softmax_stable(data, res); + break; + } } // ************************************************* // TanH Activation // ************************************************* -template -void dense_tanh(stream &data, stream &res) { - #include "activation_tables/tanh_table.tb" - static const int MAX_VALUE=4; +template void dense_tanh(stream &data, stream &res) { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - TanHActLoop: +TanHActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - TanHPackLoop: + TanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type absoluteValue; - if(in_data[j] < 0) absoluteValue = (-1)*in_data[j]; - else absoluteValue = in_data[j]; + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; hls_register int index; - if (absoluteValue <= MAX_VALUE) index = (absoluteValue*(CONFIG_T::table_size/MAX_VALUE)).to_int(); - else index = CONFIG_T::table_size-1; - - if(in_data[j] > 0) out_data[j] = tanh_table[index]; - else out_data[j] = -tanh_table[index]; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; } res.write(out_data); @@ -507,34 +524,39 @@ void dense_tanh(stream &data, stream &res) { // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(stream &data, stream &res) { - #include 
"activation_tables/sigmoid_table.tb" - static const int MAX_VALUE=8; +template void sigmoid(stream &data, stream &res) { +#include "activation_tables/sigmoid_table.tb" + static const int MAX_VALUE = 8; constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - SigmoidActLoop: +SigmoidActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - SigmoidPackLoop: + SigmoidPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { hls_register typename data_T::value_type absoluteValue; - if(in_data[j] < 0) absoluteValue = (-1)*in_data[j]; - else absoluteValue = in_data[j]; + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; hls_register int index; - if (absoluteValue <= MAX_VALUE) index = (absoluteValue*(CONFIG_T::table_size/MAX_VALUE)).to_int(); - else index = CONFIG_T::table_size-1; - - if(in_data[j] > 0) out_data[j] = sigmoid_table[index]; - else out_data[j] = 1 - sigmoid_table[index]; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; } res.write(out_data); @@ -545,27 +567,26 @@ void sigmoid(stream &data, stream &res) { // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template -void hard_sigmoid(stream &data, stream &res) { - static const typename data_T::value_type slope = (typename data_T::value_type) 0.2; - static const typename data_T::value_type shift = (typename data_T::value_type) 0.5; +template void hard_sigmoid(stream &data, stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; - HardSigmoidActLoop: +HardSigmoidActLoop: #pragma ii pipeline for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); res_T out_data; - HardSigmoidPackLoop: + HardSigmoidPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - hls_register typename data_T::value_type datareg = slope * in_data[j] + shift; - if (datareg > 1) datareg = 1; - else if (datareg < 0) datareg = 0; + hls_register auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; out_data[j] = datareg; } @@ -573,23 +594,51 @@ void hard_sigmoid(stream &data, stream &res) { } } +template void hard_tanh(stream &data, stream &res) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_T::size / multiplier_limit; + +HardSigmoidActLoop: + #pragma ii pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + // ************************************************* // Binary TanH Activation // 
************************************************* -template -void binary_tanh(stream &data, stream &res) { - BinaryTanHActLoop: +template void binary_tanh(stream &data, stream &res) { +BinaryTanHActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { - + hls_register data_T in_data = data.read(); hls_register res_T out_data; - BinaryTanHPackLoop: + BinaryTanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if(in_data[j] > 0) out_data[j] = (typename res_T::value_type) 1; - else out_data[j] = (typename res_T::value_type) -1; + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; } res.write(out_data); @@ -599,28 +648,29 @@ void binary_tanh(stream &data, stream &res) { // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(stream &data, stream &res) { - TernaryTanHActLoop: +template void ternary_tanh(stream &data, stream &res) { +TernaryTanHActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { - + hls_register data_T in_data = data.read(); hls_register res_T out_data; - TernaryTanHPackLoop: + TernaryTanHPackLoop: #pragma unroll for (int j = 0; j < res_T::size; j++) { - if(in_data[j] > 1) out_data[j] = (typename res_T::value_type) 1; - else if (in_data[j] <=-1) out_data[j] = (typename res_T::value_type) -1; - else out_data[j] = (typename res_T::value_type) 0; + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; } res.write(out_data); } - } -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/quartus/firmware/parameters.h b/hls4ml/templates/quartus/firmware/parameters.h index 75a3a0d70..e23ca9770 100644 --- a/hls4ml/templates/quartus/firmware/parameters.h +++ b/hls4ml/templates/quartus/firmware/parameters.h @@ -4,9 +4,8 @@ #include "defines.h" #include "nnet_utils/nnet_helpers.h" -//hls-fpga-machine-learning insert includes - -//hls-fpga-machine-learning insert layer-config +// hls-fpga-machine-learning insert includes +// hls-fpga-machine-learning insert layer-config #endif diff --git a/hls4ml/templates/quartus/myproject_bridge.cpp b/hls4ml/templates/quartus/myproject_bridge.cpp index b0cdfc756..f4c23b20f 100644 --- a/hls4ml/templates/quartus/myproject_bridge.cpp +++ b/hls4ml/templates/quartus/myproject_bridge.cpp @@ -7,10 +7,10 @@ #include namespace nnet { - bool trace_enabled = false; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet extern "C" { @@ -23,7 +23,7 @@ void allocate_trace_storage(size_t element_size) { nnet::trace_enabled = true; nnet::trace_outputs = new std::map; nnet::trace_type_size = element_size; - //hls-fpga-machine-learning insert trace_outputs + // hls-fpga-machine-learning insert trace_outputs } void free_trace_storage() { @@ -48,18 +48,17 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { // Wrapper of top level function for Python bridge void myproject_float( - //hls-fpga-machine-learning insert header #float + // hls-fpga-machine-learning insert header #float ) { - - //hls-fpga-machine-learning insert wrapper #float + + // hls-fpga-machine-learning insert wrapper #float 
} void myproject_double( - //hls-fpga-machine-learning insert header #double + // hls-fpga-machine-learning insert header #double ) { - //hls-fpga-machine-learning insert wrapper #double + // hls-fpga-machine-learning insert wrapper #double } - } #endif diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp b/hls4ml/templates/quartus/myproject_test_parallel.cpp index 27e1476fd..380941853 100644 --- a/hls4ml/templates/quartus/myproject_test_parallel.cpp +++ b/hls4ml/templates/quartus/myproject_test_parallel.cpp @@ -16,14 +16,13 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . // +#include +#include #include #include -#include -#include #include -#include +#include -#include "firmware/parameters.h" #include "firmware/myproject.h" #define CHECKPOINT 5000 @@ -34,13 +33,12 @@ // This function returns the next float (by argument) at position pos, // updating pos. True is returned if conversion done, false if the string // has ended, and std::invalid_argument exception if the sting was bad. -bool nextToken(const std::string& str, std::size_t& pos, float& val) -{ +bool nextToken(const std::string &str, std::size_t &pos, float &val) { while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; + pos++; } if (pos >= str.size()) { - return false; + return false; } std::size_t offset = 0; val = std::stof(str.substr(pos), &offset); @@ -49,12 +47,11 @@ bool nextToken(const std::string& str, std::size_t& pos, float& val) } int main(int argc, char **argv) { - //load input data from text file + // load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); - //load predictions from text file + // load predictions from text file std::ifstream fpr("tb_data/tb_output_predictions.dat"); - std::string RESULTS_LOG = "tb_data/results.log"; std::ofstream fout(RESULTS_LOG); @@ -65,62 +62,62 @@ int main(int argc, char **argv) { std::vector outputs; if (fin.is_open() && fpr.is_open()) { - std::vector > predictions; - unsigned int num_iterations = 0; - for (; std::getline(fin,iline) && std::getline (fpr,pline); num_iterations++) { - if (num_iterations % CHECKPOINT == 0) { - std::cout << "Processing input " << num_iterations << std::endl; + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } + + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } + + // hls-fpga-machine-learning insert data + predictions.push_back(std::move(pr)); } - std::vector in; - std::vector pr; - float current; + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function - std::size_t pos = 0; - while(nextToken(iline, pos, current)) { - in.push_back(current); - } + // hls-fpga-machine-learning insert run - pos = 0; - while(nextToken(pline, pos, current)) { - pr.push_back(current); + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } } - - 
//hls-fpga-machine-learning insert data - predictions.push_back(std::move(pr)); - } - - // Do this separately to avoid vector reallocation - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert run - - - for(int j = 0; j < num_iterations; j++) { - //hls-fpga-machine-learning insert tb-output - if (j % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - //hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - //hls-fpga-machine-learning insert quantized - } - } - fin.close(); - fpr.close(); + fin.close(); + fpr.close(); } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." << std::endl; - //hls-fpga-machine-learning insert zero + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero - //hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert run + // hls-fpga-machine-learning insert run - for (int j = 0; j < num_iterations; j++) { - //hls-fpga-machine-learning insert output + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output - //hls-fpga-machine-learning insert tb-output - } + // hls-fpga-machine-learning insert tb-output + } } fout.close(); diff --git a/hls4ml/templates/quartus/myproject_test_stream.cpp b/hls4ml/templates/quartus/myproject_test_stream.cpp index 881cbea4f..5e5f89e75 100644 --- a/hls4ml/templates/quartus/myproject_test_stream.cpp +++ b/hls4ml/templates/quartus/myproject_test_stream.cpp @@ -1,11 +1,10 @@ +#include +#include #include #include -#include -#include #include -#include +#include -#include "firmware/parameters.h" #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" @@ -18,12 +17,12 @@ // This function returns the next float (by argument) at position pos, // updating pos. True is returned if conversion done, false if the string // has ended, and std::invalid_argument exception if the sting was bad. 
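As a quick illustration of the contract described above, a hypothetical parse_line helper that consumes one whitespace-separated line the way main() consumes tb_input_features.dat (the name parse_line and the sample values are examples only; it assumes <string> and <vector> from this file's includes and the nextToken helper defined below is in scope):

std::vector<float> parse_line(const std::string &line) {
    std::vector<float> values;
    std::size_t pos = 0;
    float current;
    while (nextToken(line, pos, current)) {
        values.push_back(current);
    }
    // e.g. "0.25 -1.5 3.0" -> {0.25f, -1.5f, 3.0f}; a malformed token throws std::invalid_argument.
    return values;
}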
-bool nextToken(const std::string& str, std::size_t& pos, float& val) { +bool nextToken(const std::string &str, std::size_t &pos, float &val) { while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; + pos++; } if (pos >= str.size()) { - return false; + return false; } std::size_t offset = 0; val = std::stof(str.substr(pos), &offset); @@ -35,8 +34,8 @@ int main(int argc, char **argv) { // Load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); std::string iline; - - //Load predictions from text file + + // Load predictions from text file std::ifstream fpr("tb_data/tb_output_predictions.dat"); std::string pline; @@ -45,80 +44,79 @@ int main(int argc, char **argv) { std::ofstream fout(RESULTS_LOG); if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - - unsigned int iteration = 0; - while(std::getline(fin,iline) && std::getline(fpr,pline)) { - if (iteration % CHECKPOINT == 0) { - std::cout << "Processing input " << iteration << std::endl; - } + std::vector> predictions; - //hls-fpga-machine learning instantiate inputs and outputs + unsigned int iteration = 0; + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (iteration % CHECKPOINT == 0) { + std::cout << "Processing input " << iteration << std::endl; + } - std::vector in; - std::vector pr; - float current; + // hls-fpga-machine learning instantiate inputs and outputs - std::size_t pos = 0; - while(nextToken(iline, pos, current)) { - in.push_back(current); - } + std::vector in; + std::vector pr; + float current; - pos = 0; - while(nextToken(pline, pos, current)) { - pr.push_back(current); - } + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } - //hls-fpga-machine-learning insert data + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } - predictions.push_back(std::move(pr)); + // hls-fpga-machine-learning insert data - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert run + predictions.push_back(std::move(pr)); - //hls-fpga-machine-learning convert output + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert tb-output - - if (iteration % CHECKPOINT == 0) { - std::cout << "Python Predictions" << std::endl; - //hls-fpga-machine-learning print predictions - - std::cout << "HLS predictions" << std::endl; - //hls-fpga-machine-learning print output - } + // hls-fpga-machine-learning insert run + + // hls-fpga-machine-learning convert output + + // hls-fpga-machine-learning insert tb-output - iteration++; - } + if (iteration % CHECKPOINT == 0) { + std::cout << "Python Predictions" << std::endl; + // hls-fpga-machine-learning print predictions - fin.close(); - fpr.close(); + std::cout << "HLS predictions" << std::endl; + // hls-fpga-machine-learning print output + } + + iteration++; + } + + fin.close(); + fpr.close(); } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." << std::endl; + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." 
<< std::endl; - for (int iteration = 0 ; iteration < num_iterations ; iteration++) { - //hls-fpga-machine learning instantiate inputs and outputs + for (int iteration = 0; iteration < num_iterations; iteration++) { + // hls-fpga-machine learning instantiate inputs and outputs - //hls-fpga-machine-learning insert zero + // hls-fpga-machine-learning insert zero - //hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert top-level-function - //hls-fpga-machine-learning insert run + // hls-fpga-machine-learning insert run - //hls-fpga-machine-learning convert output + // hls-fpga-machine-learning convert output - //hls-fpga-machine-learning insert tb-output + // hls-fpga-machine-learning insert tb-output - if (iteration % CHECKPOINT == 0) { - std::cout << "HLS predictions" << std::endl; - //hls-fpga-machine-learning print output + if (iteration % CHECKPOINT == 0) { + std::cout << "HLS predictions" << std::endl; + // hls-fpga-machine-learning print output + } } - - } - } fout.close(); diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index 40ec72ea8..1f11b0209 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -1,14 +1,14 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" #include "nnet_utils/nnet_types.h" #include #include -//hls-fpga-machine-learning insert numbers +// hls-fpga-machine-learning insert numbers -//hls-fpga-machine-learning insert layer-precision +// hls-fpga-machine-learning insert layer-precision #endif diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp b/hls4ml/templates/vivado/firmware/myproject.cpp index 3aa4d58a3..1c7342a34 100644 --- a/hls4ml/templates/vivado/firmware/myproject.cpp +++ b/hls4ml/templates/vivado/firmware/myproject.cpp @@ -22,15 +22,15 @@ #include "parameters.h" void myproject( - //hls-fpga-machine-learning insert header + // hls-fpga-machine-learning insert header ) { - //hls-fpga-machine-learning insert IO + // hls-fpga-machine-learning insert IO #ifndef __SYNTHESIS__ static bool loaded_weights = false; if (!loaded_weights) { - //hls-fpga-machine-learning insert load weights + // hls-fpga-machine-learning insert load weights loaded_weights = true; } #endif @@ -39,5 +39,5 @@ void myproject( // NETWORK INSTANTIATION // **************************************** - //hls-fpga-machine-learning insert layers + // hls-fpga-machine-learning insert layers } diff --git a/hls4ml/templates/vivado/firmware/myproject.h b/hls4ml/templates/vivado/firmware/myproject.h index 5e2130926..1199fbc68 100644 --- a/hls4ml/templates/vivado/firmware/myproject.h +++ b/hls4ml/templates/vivado/firmware/myproject.h @@ -20,15 +20,15 @@ #ifndef MYPROJECT_H_ #define MYPROJECT_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" #include "hls_stream.h" #include "defines.h" // Prototype of top level function for C-synthesis void myproject( - //hls-fpga-machine-learning insert header + // hls-fpga-machine-learning insert header ); #endif diff --git a/hls4ml/templates/vivado/firmware/parameters.h b/hls4ml/templates/vivado/firmware/parameters.h index addee4ef2..2d9ddedb3 100644 --- a/hls4ml/templates/vivado/firmware/parameters.h +++ b/hls4ml/templates/vivado/firmware/parameters.h @@ -1,15 +1,15 @@ #ifndef PARAMETERS_H_ #define PARAMETERS_H_ -#include "ap_int.h" #include "ap_fixed.h" +#include "ap_int.h" -#include "nnet_utils/nnet_helpers.h" #include "nnet_utils/nnet_code_gen.h" 
-//hls-fpga-machine-learning insert includes - -//hls-fpga-machine-learning insert weights +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert weights -//hls-fpga-machine-learning insert layer-config +// hls-fpga-machine-learning insert layer-config #endif diff --git a/hls4ml/templates/vivado/myproject_bridge.cpp b/hls4ml/templates/vivado/myproject_bridge.cpp index 210635ac0..35c1997f6 100644 --- a/hls4ml/templates/vivado/myproject_bridge.cpp +++ b/hls4ml/templates/vivado/myproject_bridge.cpp @@ -6,14 +6,13 @@ #include #include -//hls-fpga-machine-learning insert bram - +// hls-fpga-machine-learning insert bram namespace nnet { - bool trace_enabled = false; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet extern "C" { @@ -26,7 +25,7 @@ void allocate_trace_storage(size_t element_size) { nnet::trace_enabled = true; nnet::trace_outputs = new std::map; nnet::trace_type_size = element_size; - //hls-fpga-machine-learning insert trace_outputs + // hls-fpga-machine-learning insert trace_outputs } void free_trace_storage() { @@ -51,18 +50,17 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { // Wrapper of top level function for Python bridge void myproject_float( - //hls-fpga-machine-learning insert header #float + // hls-fpga-machine-learning insert header #float ) { - - //hls-fpga-machine-learning insert wrapper #float + + // hls-fpga-machine-learning insert wrapper #float } void myproject_double( - //hls-fpga-machine-learning insert header #double + // hls-fpga-machine-learning insert header #double ) { - //hls-fpga-machine-learning insert wrapper #double + // hls-fpga-machine-learning insert wrapper #double } - } #endif diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 7de8dd4b4..9d7e7685c 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -16,97 +16,95 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . 
// +#include #include #include -#include -#include #include +#include #include #include -#include +#include #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" -//hls-fpga-machine-learning insert bram +// hls-fpga-machine-learning insert bram #define CHECKPOINT 5000 namespace nnet { - bool trace_enabled = true; - std::map *trace_outputs = NULL; - size_t trace_type_size = sizeof(double); -} +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet -int main(int argc, char **argv) -{ - //load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - //load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); +int main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); #ifdef RTL_SIM - std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; #else - std::string RESULTS_LOG = "tb_data/csim_results.log"; + std::string RESULTS_LOG = "tb_data/csim_results.log"; #endif - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - int e = 0; - - if (fin.is_open() && fpr.is_open()) { - while ( std::getline(fin,iline) && std::getline (fpr,pline) ) { - if (e % CHECKPOINT == 0) std::cout << "Processing input " << e << std::endl; - char* cstr=const_cast(iline.c_str()); - char* current; - std::vector in; - current=strtok(cstr," "); - while(current!=NULL) { - in.push_back(atof(current)); - current=strtok(NULL," "); - } - cstr=const_cast(pline.c_str()); - std::vector pr; - current=strtok(cstr," "); - while(current!=NULL) { - pr.push_back(atof(current)); - current=strtok(NULL," "); - } - - //hls-fpga-machine-learning insert data - - //hls-fpga-machine-learning insert top-level-function - - if (e % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - //hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - //hls-fpga-machine-learning insert quantized - } - e++; - - //hls-fpga-machine-learning insert tb-output - + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." 
<< std::endl; + + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output } - fin.close(); - fpr.close(); - } else { - std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; - - //hls-fpga-machine-learning insert zero - - //hls-fpga-machine-learning insert top-level-function - - //hls-fpga-machine-learning insert output - - //hls-fpga-machine-learning insert tb-output - - } - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - return 0; + return 0; } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index af609d99d..3a96482db 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -20,14 +20,13 @@ #ifndef NNET_ACTIVATION_H_ #define NNET_ACTIVATION_H_ -#include #include "ap_fixed.h" #include "nnet_common.h" +#include namespace nnet { -struct activ_config -{ +struct activ_config { // IO size static const unsigned n_in = 10; @@ -39,91 +38,80 @@ struct activ_config static const unsigned reuse_factor = 1; // Internal data type definitions - typedef ap_fixed<18,8> table_t; + typedef ap_fixed<18, 8> table_t; }; // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - for (int ii=0; ii -void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = 0; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; } } -template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii MAX_INT) res[ii] = MAX_INT; - else res[ii] = datareg; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; } } -template -void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } -template -void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { relu_max(data, res); } // ************************************************* // Sigmoid Activation // ************************************************* -inline float sigmoid_fcn_float(float input) { - return 1.0 / (1 + std::exp(-input)); -} +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } -template -void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default logistic sigmoid function: // result = 1/(1+e^(-x)) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, 
range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -142,12 +130,14 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) sigmoid_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; } } @@ -155,33 +145,29 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Softmax Activation // ************************************************* -enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3}; +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; -inline float exp_fcn_float(float input) { - return std::exp(input); -} +inline float exp_fcn_float(float input) { return std::exp(input); } -template -inline float softmax_real_val_from_idx(unsigned i){ +template inline float softmax_real_val_from_idx(unsigned i) { // Treat the index as the top N bits static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table data_T x(0); - x(x.width-1, x.width-N) = i; - return (float) x; + x(x.width - 1, x.width - N) = i; + return (float)x; } -template -inline unsigned softmax_idx_from_real_val(data_T x){ +template inline unsigned softmax_idx_from_real_val(data_T x) { // Slice the top N bits to get an index into the table static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table - ap_uint y = x(x.width-1, x.width-N); // slice the top N bits of input - return (unsigned) y(N-1, 0); + ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); } -template -void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]){ +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { // The template data_T is the data type used to address the table - for(unsigned i = 0; i < CONFIG_T::table_size; i++){ + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { // Slicing bits for address is going to round towards 0, so take the central value float x = softmax_real_val_from_idx(i); typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); @@ -189,10 +175,10 @@ void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_siz } } -template -void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]){ +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { // The template data_T is the data type used to address 
the table - for(unsigned i = 0; i < CONFIG_T::table_size; i++){ + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { float x = softmax_real_val_from_idx(i); typename CONFIG_T::inv_table_t inv_x = 1 / x; table_out[i] = inv_x; @@ -200,7 +186,7 @@ void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_ } template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS pipeline // Initialize the lookup tables #ifdef __HLS_SYN__ @@ -225,7 +211,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll unsigned x = softmax_idx_from_real_val(data[i]); exp_res[i] = exp_table[x]; @@ -234,17 +220,19 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; } } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS pipeline // Initialize the lookup tables #ifdef __HLS_SYN__ @@ -270,8 +258,8 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ data_T x_max = reduce>(data, op_max); // For the diffs, use the same type as the input but force rounding and saturation - ap_fixed d_xi_xmax[CONFIG_T::n_in]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll d_xi_xmax[i] = data[i] - x_max; } @@ -280,7 +268,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); exp_res[i] = exp_table[x]; @@ -289,45 +277,44 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // Explicitly sum the results with an adder tree. 
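The softmax_idx_from_real_val helper above turns a fixed-point value into a table address simply by keeping its top ceillog2(table_size) bits; because that slice includes the sign bit, negative inputs land in the upper half of the table, and the init_* functions build the table in the same index order so the two stay consistent. A small stand-alone sketch of the slicing with plain integers (uint16_t stands in for the ap_fixed container, and the widths and table size are example values):

#include <cstdint>

// Keep the top N bits of a W-bit word as a table index (assumes 0 < N <= W <= 16).
constexpr unsigned top_bits_index(std::uint16_t raw, unsigned W, unsigned N) {
    return (static_cast<unsigned>(raw) >> (W - N)) & ((1u << N) - 1u);
}

// With a 16-bit word and a 256-entry table (N = 8), the top byte selects the entry.
static_assert(top_bits_index(0xABCD, 16, 8) == 0xAB, "top byte addresses the table");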
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; - for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; } } -template -void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { // Inversion function: // result = 1/x for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range 0 to +64) - float in_val = 64.0*ii/float(N_TABLE); + float in_val = 64.0 * ii / float(N_TABLE); // Next, compute lookup table function - if (in_val > 0.0) table_out[ii] = 1.0/in_val; - else table_out[ii] = 0.0; + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; } } -template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -347,50 +334,54 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) #pragma HLS PIPELINE // Index into the lookup table based on data for exponentials - typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision - typename CONFIG_T::table_t exp_diff_res;// different, independent, fixed point precision + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision data_T data_cache[CONFIG_T::n_in]; int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table[index]; } exp_res[ii] += exp_diff_res; } } - //Second loop to invert - for (int ii=0; ii CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; - //typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; - res[ii] = (res_T) invert_table[exp_res_index]; + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if 
(exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; } - } -template +template void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { for (int i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS UNROLL - res[i] = (res_T) 0; + res[i] = (res_T)0; } data_T maximum = data[0]; - int idx = 0; + int idx = 0; for (int i = 1; i < CONFIG_T::n_in; i++) { #pragma HLS PIPELINE @@ -400,13 +391,13 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } - res[idx] = (res_T) 1; + res[idx] = (res_T)1; } -template -void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS inline - switch(CONFIG_T::implementation){ + switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency(data, res); break; @@ -425,24 +416,20 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // ************************************************* // TanH Activation // ************************************************* -template -void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Implement tanh lookup for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -4 to +4) - float in_val = 2*4.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = tanh(in_val); - //std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; table_out[ii] = real_val; } } - -template -void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -461,92 +448,105 @@ void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) tanh_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; } } // ************************************************* // Hard sigmoid Activation // ************************************************* -template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - data_T datareg; - data_T slope = (data_T) 0.2; - data_T shift = (data_T) 0.5; - for (int ii=0; ii 1) datareg = 1; - else if (datareg < 0) datareg = 0; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto 
datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; res[ii] = datareg; } } +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; } } // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) -{ +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii theta) res[ii] = datareg; - else res[ii] = 0; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; } } // ************************************************* // Softplus Activation // ************************************************* -inline float softplus_fcn_float(float input) { - return std::log(std::exp(input) + 1.); -} +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } -template -void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default softplus function: // result = log(exp(x) + 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -565,40 +565,37 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softplus_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; } } // 
************************************************* // Softsign Activation // ************************************************* -inline float softsign_fcn_float(float input) { - return input / (std::abs(input) + 1.); -} +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } -template -void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default softsign function: // result = x / (abs(x) + 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2*8.0*(ii-float(N_TABLE)/2.0)/float(N_TABLE); + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -617,40 +614,37 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // Index into the lookup table based on data int data_round; int index; - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - res[ii] = (res_T) softsign_table[index]; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; } } // ************************************************* // ELU Activation // ************************************************* -inline float elu_fcn_float(float input) { - return std::exp(input) - 1.; -} +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } -template -void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default ELU function: // result = alpha * (e^(x) - 1) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0*ii/float(N_TABLE); + float in_val = -8.0 * ii / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) -{ +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -669,21 +663,20 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_ data_T datareg; // Index into the lookup table based on data int index; - for (int ii=0; ii= 0) { res[ii] = datareg; } else { - index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = 
CONFIG_T::table_size-1; + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = alpha * elu_table[index]; } } } -template -void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { elu(data, 1.0, res); } @@ -694,24 +687,20 @@ inline float selu_fcn_float(float input) { return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); } -template -void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Default SELU function: // result = 1.05 * (1.673 * (e^(x) - 1)) for (int ii = 0; ii < N_TABLE; ii++) { // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0*ii/float(N_TABLE); + float in_val = -8.0 * ii / float(N_TABLE); // Next, compute lookup table function typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); - //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; table_out[ii] = real_val; } } -template -void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -730,13 +719,14 @@ void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) data_T datareg; // Index into the lookup table based on data int index; - for (int ii=0; ii= 0) { res[ii] = res_T(1.0507009873554804934193349852946) * datareg; } else { - index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; res[ii] = selu_table[index]; } } @@ -745,59 +735,62 @@ void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; - for (int ii=0; ii 0) res[ii] = datareg; - else res[ii] = alpha[ii] * datareg; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; } } // ************************************************* // Binary TanH Activation // ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; res_T cache; - for (int ii=0; ii 0 ) cache = 1; - else cache = -1; + if (datareg > 0) + cache = 1; + else + cache = -1; - res[ii] = (res_T) cache; + res[ii] = (res_T)cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE - - data_T datareg; 
- res_T cache; - for (int ii=0; ii 1 ) cache = 1; - else if( datareg > -1 && datareg <= 1) cache=0; - else cache = -1; - - res[ii] = (res_T) cache; + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; } - } -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 3accdc650..075672c6b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -20,29 +20,30 @@ #ifndef NNET_ACTIVATION_STREAM_H_ #define NNET_ACTIVATION_STREAM_H_ -#include #include "ap_fixed.h" #include "hls_stream.h" +#include "nnet_activation.h" #include "nnet_common.h" -#include "nnet_types.h" #include "nnet_stream.h" -#include "nnet_activation.h" +#include "nnet_types.h" +#include namespace nnet { // ************************************************* // LINEAR Activation // ************************************************* -template -void linear(hls::stream &data, hls::stream &res) { - LinearActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +template void linear(hls::stream &data, hls::stream &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - LinearPackLoop: for (int j = 0; j < res_T::size; j++) { + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL out_data[j] = in_data[j]; } @@ -51,23 +52,25 @@ void linear(hls::stream &data, hls::stream &res) { } } - // ************************************************* // RELU Activation // ************************************************* -template -void relu(hls::stream &data, hls::stream &res) { - ReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +template void relu(hls::stream &data, hls::stream &res) { +ReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - ReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + ReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); @@ -78,8 +81,7 @@ void relu(hls::stream &data, hls::stream &res) { // Sigmoid Activation // ************************************************* -template -void sigmoid(hls::stream &data, hls::stream &res) { +template void sigmoid(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -93,19 +95,23 @@ void sigmoid(hls::stream &data, hls::stream &res) { initialized = true; } - SigmoidActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - if 
(index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = sigmoid_table[index]; } @@ -113,13 +119,12 @@ void sigmoid(hls::stream &data, hls::stream &res) { } } - // ************************************************* // Softmax Activation // ************************************************* template -void softmax_latency(hls::stream &data, hls::stream &res){ +void softmax_latency(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; @@ -146,11 +151,13 @@ void softmax_latency(hls::stream &data, hls::stream &res){ typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma HLS array_partition variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - SoftmaxExpLoop: for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++){ +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_pack = data.read(); - SoftmaxExpPackLoop: for(unsigned j = 0; j < data_T::size; j++){ + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL unsigned x = softmax_idx_from_real_val(in_pack[j]); exp_res[j] = exp_table[x]; @@ -159,13 +166,16 @@ void softmax_latency(hls::stream &data, hls::stream &res){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; @@ -175,7 +185,7 @@ void softmax_latency(hls::stream &data, hls::stream &res){ } template -void softmax_stable(hls::stream &data, hls::stream &res){ +void softmax_stable(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; @@ -199,23 +209,26 @@ void softmax_stable(hls::stream &data, hls::stream &res){ constexpr unsigned ii = data_T::size / multiplier_limit; typename data_T::value_type data_array[data_T::size]; - #pragma HLS ARRAY_PARTITION variable=data_array complete - SoftmaxArrayLoop: for(unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++){ +#pragma HLS ARRAY_PARTITION variable=data_array complete +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_pack = data.read(); - SoftmaxArrayPackLoop: for(unsigned j = 0; j < data_T::size; j++){ + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; - typename data_T::value_type x_max = reduce>(data_array, op_max); + typename data_T::value_type x_max = + reduce>(data_array, op_max); // For the diffs, use the same 
type as the input but force rounding and saturation - ap_fixed d_xi_xmax[data_T::size]; - for(unsigned j = 0; j < data_T::size; j++){ + ap_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL d_xi_xmax[j] = data_array[j] - x_max; } @@ -224,7 +237,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma HLS ARRAY_PARTITION variable=exp_res complete typename CONFIG_T::exp_table_t exp_sum(0); - for(unsigned j = 0; j < data_T::size; j++){ + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); exp_res[j] = exp_table[x]; @@ -233,13 +246,16 @@ void softmax_stable(hls::stream &data, hls::stream &res){ // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - exp_sum = reduce>(exp_res, op_add); + exp_sum = + reduce>(exp_res, op_add); - typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++){ + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; @@ -248,7 +264,7 @@ void softmax_stable(hls::stream &data, hls::stream &res){ } } -template +template void softmax_legacy(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ @@ -271,18 +287,22 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { typename CONFIG_T::table_t exp_diff_res; typename data_T::value_type data_cache[data_T::size]; - SoftmaxInitLoop: for(unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { #pragma HLS PIPELINE data_T in_pack = data.read(); - SoftmaxInitPackLoop: for(unsigned j = 0; j < data_T::size; j++) { + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL data_cache[j] = in_pack[j]; exp_res[j] = 0; } - SoftmaxExpLoop: for (int i = 0; i < data_T::size; i++) { - #pragma HLS UNROLL - SoftmaxExpInner: for (int j = 0; j < data_T::size; j++) { + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + #pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL if (i == j) { @@ -290,8 +310,10 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } else { int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; exp_diff_res = exp_table[index]; } @@ -300,21 +322,24 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - PRAGMA_DATA_PACK(out_pack) - SoftmaxInvPackLoop: for(unsigned j = 0; j < res_T::size; j++) { + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; - if (exp_res_index < 0) exp_res_index = 
0; - if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = (typename res_T::value_type) invert_table[exp_res_index]; + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; } res.write(out_pack); } } -template +template void softmax_argmax(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE @@ -323,11 +348,11 @@ void softmax_argmax(hls::stream &data, hls::stream &res) { for (int i = 0; i < res_T::size; i++) { #pragma HLS UNROLL - out_data[i] = (typename res_T::value_type) 0; + out_data[i] = (typename res_T::value_type)0; } typename data_T::value_type maximum = in_data[0]; - int idx = 0; + int idx = 0; for (int i = 1; i < res_T::size; i++) { #pragma HLS PIPELINE @@ -337,17 +362,15 @@ void softmax_argmax(hls::stream &data, hls::stream &res) { } } - out_data[idx] = (typename res_T::value_type) 1; + out_data[idx] = (typename res_T::value_type)1; res.write(out_data); } } - -template -void softmax(hls::stream &data, hls::stream &res){ +template void softmax(hls::stream &data, hls::stream &res) { assert(CONFIG_T::axis == -1); - switch(CONFIG_T::implementation){ + switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency(data, res); break; @@ -360,16 +383,14 @@ void softmax(hls::stream &data, hls::stream &res){ case softmax_implementation::argmax: softmax_argmax(data, res); break; - } + } } // ************************************************* // TanH Activation // ************************************************* - -template -void tanh(hls::stream &data, hls::stream &res) { +template void tanh(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -383,19 +404,23 @@ void tanh(hls::stream &data, hls::stream &res) { initialized = true; } - TanHActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - TanHPackLoop: for (int j = 0; j < res_T::size; j++) { + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/8; - int index = data_round + 4*CONFIG_T::table_size/8; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = tanh_table[index]; } @@ -403,28 +428,29 @@ void tanh(hls::stream &data, hls::stream &res) { } } - // ************************************************* // Hard sigmoid Activation // ************************************************* -template +template void hard_sigmoid(hls::stream &data, hls::stream &res) { - typename data_T::value_type slope = (typename data_T::value_type) 0.2; - typename data_T::value_type shift = (typename data_T::value_type) 0.5; - HardSigmoidActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - 
HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - typename data_T::value_type datareg = slope * in_data[j] + shift; - if (datareg > 1) datareg = 1; - else if (datareg < 0) datareg = 0; + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; out_data[j] = datareg; } @@ -432,60 +458,89 @@ void hard_sigmoid(hls::stream &data, hls::stream &res) { } } +template void hard_tanh(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + #pragma HLS DATA_PACK variable=out_data + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} // ************************************************* // Leaky RELU Activation // ************************************************* -template +template void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { - LeakyReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; } res.write(out_data); } } - // ************************************************* // Thresholded RELU Activation // ************************************************* -template +template void thresholded_relu(hls::stream &data, typename data_T::value_type theta, hls::stream &res) { - ThresholdedReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > theta) out_data[j] = in_data[j]; - else out_data[j] = 0; + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; } res.write(out_data); } } - // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(hls::stream &data, hls::stream &res) { +template void softplus(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -499,32 +554,34 @@ void softplus(hls::stream &data, hls::stream &res) { initialized = true; } - SoftplusActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { + SoftplusPackLoop: + for (int j = 
0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softplus_table[index]; } res.write(out_data); } } - // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(hls::stream &data, hls::stream &res) { +template void softsign(hls::stream &data, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -538,30 +595,33 @@ void softsign(hls::stream &data, hls::stream &res) { initialized = true; } - SoftsignActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - int data_round = in_data[j]*CONFIG_T::table_size/16; - int index = data_round + 8*CONFIG_T::table_size/16; - if (index < 0) index = 0; - else if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = softsign_table[index]; } res.write(out_data); } } - // ************************************************* // ELU Activation // ************************************************* -template +template void elu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ @@ -576,22 +636,25 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre initialized = true; } - EluActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - EluPackLoop: for (int j = 0; j < res_T::size; j++) { + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - + typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { - int index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = alpha * elu_table[index]; } } @@ -599,8 +662,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre } } -template -void elu(hls::stream &data, hls::stream &res) { +template void elu(hls::stream &data, hls::stream &res) { elu(data, 1.0, res); } @@ -608,8 +670,7 @@ void elu(hls::stream &data, hls::stream &res) { // SELU Activation // ************************************************* -template -void selu(hls::stream &data, hls::stream &res) { +template void selu(hls::stream &data, hls::stream &res) { // Initialize the lookup 
table #ifdef __HLS_SYN__ bool initialized = false; @@ -623,22 +684,25 @@ void selu(hls::stream &data, hls::stream &res) { initialized = true; } - SeluActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - SeluPackLoop: for (int j = 0; j < res_T::size; j++) { + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL typename data_T::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = (typename data_T::value_type) 1.0507009873554804934193349852946 * datareg; + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; } else { - int index = datareg*CONFIG_T::table_size/-8; - if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; out_data[j] = selu_table[index]; } } @@ -646,24 +710,27 @@ void selu(hls::stream &data, hls::stream &res) { } } - // ************************************************* // PReLU Activation // ************************************************* -template +template void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T::n_in], hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if (in_data[j] > 0) out_data[j] = in_data[j]; - else out_data[j] = alpha[i*res_T::size+j] * in_data[j]; + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; } res.write(out_data); } @@ -672,19 +739,23 @@ void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T // ************************************************* // Binary TanH Activation // ************************************************* -template +template void binary_tanh(hls::stream &data, hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if(in_data[j] > 0) out_data[j] = (typename res_T::value_type) 1; - else out_data[j] = (typename res_T::value_type) -1; + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; } res.write(out_data); } @@ -693,26 +764,30 @@ void binary_tanh(hls::stream &data, hls::stream &res) { // ************************************************* // Ternary TanH Activation // ************************************************* -template +template void ternary_tanh(hls::stream &data, hls::stream &res) { - PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); res_T out_data; PRAGMA_DATA_PACK(out_data) - PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { + 
PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - if(in_data[j] > 1) out_data[j] = (typename res_T::value_type) 1; - else if (in_data[j] <=-1) out_data[j] = (typename res_T::value_type) -1; - else out_data[j] = (typename res_T::value_type) 0; + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; } res.write(out_data); } } - -} +} // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index d170eb667..e4db43682 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,37 +1,32 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ -#include #include "nnet_helpers.h" +#include namespace nnet { -template -class FillConv1DBuffer{ - public: - static void fill_buffer( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition - ) { +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { // To be implemented in subclasses } }; -template -class FillConv2DBuffer{ - public: - static void fill_buffer( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition - ) { +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { // To be implemented in subclasses } }; -//hls4ml insert code +// hls4ml insert code -} +} // namespace nnet -#endif \ No newline at end of file +#endif diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp index 7a06633e5..05797f1f7 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp @@ -1,17 +1,14 @@ -//hls-fpga-machine-learning insert include +// hls-fpga-machine-learning insert include -void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ){ +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { - //hls-fpga-machine-learning insert interface + // hls-fpga-machine-learning insert interface - //hls-fpga-machine-learning insert local vars + // hls-fpga-machine-learning insert local vars - //hls-fpga-machine-learning insert enqueue + // hls-fpga-machine-learning insert enqueue - //hls-fpga-machine-learning insert call + // hls-fpga-machine-learning insert call - //hls-fpga-machine-learning insert dequeue + // hls-fpga-machine-learning insert dequeue } diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.h b/hls4ml/templates/vivado_accelerator/myproject_axi.h index fe3dbc5cd..a60dab39c 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.h +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.h @@ -2,12 +2,9 @@ #define MYPROJECT_AXI_H_ #include -//hls-fpga-machine-learning insert include +// hls-fpga-machine-learning insert include 
-//hls-fpga-machine-learning insert definitions +// hls-fpga-machine-learning insert definitions -void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ); +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); #endif diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index b63e2b48e..b1a47c655 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -40,18 +40,32 @@ def _get_precision_from_quantizer(quantizer): 'quantized_bits', 'quantized_relu', 'quantized_tanh', + 'quantized_sigmoid', 'quantized_po2', 'quantized_relu_po2', 'linear', ] signed = True + rnd = "AP_TRN" + overflow = "AP_WRAP" + if quantizer['class_name'] in supported_quantizers: bits = int(quantizer['config']['bits']) # if integer isn't specified, it should be the same as bits integer = int(quantizer['config'].get('integer', bits - 1)) + 1 - if quantizer['class_name'] == 'quantized_relu': + # for quantizers use the following default rounding and overflow + rnd = "AP_RND_CONV" + overflow = "AP_SAT" + if quantizer['class_name'] in ('quantized_relu', 'quantized_relu_po2'): signed = False integer -= 1 + elif quantizer['class_name'] == 'quantized_tanh': + overflow = "AP_SAT_SYM" if quantizer['config']['symmetric'] else "AP_SAT" + integer = 1 + elif quantizer['class_name'] == 'quantized_sigmoid': + integer = 0 + signed = False + elif quantizer['class_name'] in ['binary', 'stochastic_binary', 'binary_tanh']: bits = 2 integer = 2 @@ -65,7 +79,9 @@ def _get_precision_from_quantizer(quantizer): decimal = bits - integer if decimal > 0: - return hls4ml.model.types.FixedPrecisionType(width=bits, integer=integer, signed=signed) + return hls4ml.model.types.FixedPrecisionType( + width=bits, integer=integer, signed=signed, rounding_mode=rnd, saturation_mode=overflow + ) else: return hls4ml.model.types.IntegerPrecisionType(width=integer, signed=signed) diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 28dd2d7eb..a958a6b0b 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -148,7 +148,7 @@ def write_project_cpp(self, model): # Intel HLS 'streams' need to be passed by reference to top-level entity or declared as global variables # Streams cannot be declared inside a function # Therefore, layer connections (inputs/outputs) are declared here - elif '//hls-fpga-machine-learning insert inter-task streams' in line: + elif '// hls-fpga-machine-learning insert inter-task streams' in line: newline = line if io_type == 'io_stream': for layer in model.get_layers(): @@ -159,7 +159,7 @@ def write_project_cpp(self, model): newline += def_cpp + ';\n' # Instantiate GCC top-level function, to be used during GCC compilation / hls4ml.predict() - elif '//hls-fpga-machine-learning instantiate GCC top-level' in line: + elif '// hls-fpga-machine-learning instantiate GCC top-level' in line: newline = line if io_type == 'io_stream': newline += f'void {project_name}(\n' @@ -174,7 +174,7 @@ def write_project_cpp(self, model): newline += ') {\n' # Instantiate HLS top-level function, to be used during HLS synthesis - elif '//hls-fpga-machine-learning instantiate HLS top-level' in line: + elif '// hls-fpga-machine-learning instantiate HLS top-level' in line: newline = line if io_type == 'io_stream': newline += f'component void {project_name}(\n' @@ -189,7 +189,7 @@ def write_project_cpp(self, model): newline += ') {\n' # Insert HLS pragmas such as maximum frequency, initiation interval etc. 
- elif '//hls-fpga-machine-learning insert cpragmas' in line: + elif '// hls-fpga-machine-learning insert cpragmas' in line: newline = line if io_type == 'io_parallel': newline += 'hls_max_concurrency(0)\n' @@ -202,7 +202,7 @@ def write_project_cpp(self, model): # In io_stream, the input is of type 'stream_in' and output is of type 'stream_out' # However, individual layers accept the type 'stream' # Therefore, data is first read from 'stream_in', written to 'stream' and propagated through network - elif '//hls-fpga-machine-learning initialize input/output' in line: + elif '// hls-fpga-machine-learning initialize input/output' in line: if io_type == 'io_stream': newline = line for inp in model_inputs: @@ -215,21 +215,21 @@ def write_project_cpp(self, model): newline += indent + 'hls_register output_data outputs;\n' # Insert weights - elif '//hls-fpga-machine-learning insert weights' in line: + elif '// hls-fpga-machine-learning insert weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): newline += f'#include "weights/{w.name}.h"\n' # Insert test weights - elif '//hls-fpga-machine-learning insert test weights' in line: + elif '// hls-fpga-machine-learning insert test weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): newline += f'#include "weights/{w.name}_test.h"\n' # Neural net instantiation - elif '//hls-fpga-machine-learning insert layers' in line: + elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -254,7 +254,7 @@ def write_project_cpp(self, model): newline += '\n' # In io_parallel, a return is required; for more details see myproject.cpp & myproject.h - elif '//hls-fpga-machine-learning return' in line: + elif '// hls-fpga-machine-learning return' in line: if io_type == 'io_stream': newline = line for out in model_outputs: @@ -304,7 +304,7 @@ def write_project_header(self, model): elif 'myproject' in line: newline = line.replace('myproject', project_name) - elif '//hls-fpga-machine-learning instantiate GCC top-level' in line: + elif '// hls-fpga-machine-learning instantiate GCC top-level' in line: newline = line # For io_stream, input and output are passed by reference; see myproject.h & myproject.cpp for more details @@ -322,7 +322,7 @@ def write_project_header(self, model): newline += ');\n' # Similar to GCC instantiation, but with the keyword 'component' - elif '//hls-fpga-machine-learning instantiate HLS top-level' in line: + elif '// hls-fpga-machine-learning instantiate HLS top-level' in line: newline = line if io_type == 'io_stream': newline += f'component void {project_name}(\n' @@ -336,7 +336,7 @@ def write_project_header(self, model): newline += indent + 'input_data inputs\n' newline += ');\n' - elif '//hls-fpga-machine-learning insert cpragmas' in line: + elif '// hls-fpga-machine-learning insert cpragmas' in line: newline = line if io_type == 'io_parallel': newline += 'hls_max_concurrency(0)\n' @@ -346,14 +346,14 @@ def write_project_header(self, model): # For io_stream, no inputs/outputs are instantiated, as they are passed by reference # For io_parallel, input/output structs are required - elif '//hls-fpga-machine-learning insert inputs' in line: + elif '// hls-fpga-machine-learning insert inputs' in line: newline = line if io_type != 'io_stream': newline += 'struct input_data { \n' for inp in model_inputs: newline += indent + inp.definition_cpp() + ';\n' newline 
+= '};\n' - elif '//hls-fpga-machine-learning insert outputs' in line: + elif '// hls-fpga-machine-learning insert outputs' in line: newline = line if io_type != 'io_stream': newline += 'struct output_data { \n' @@ -382,12 +382,12 @@ def write_defines(self, model): for line in f.readlines(): # Insert numbers - if '//hls-fpga-machine-learning insert numbers' in line: + if '// hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) newline += ''.join(numbers) - elif '//hls-fpga-machine-learning insert layer-precision' in line: + elif '// hls-fpga-machine-learning insert layer-precision' in line: newline = line all_precision = OrderedDict() for layer in model.get_layers(): @@ -418,12 +418,12 @@ def write_parameters(self, model): for line in f.readlines(): - if '//hls-fpga-machine-learning insert includes' in line: + if '// hls-fpga-machine-learning insert includes' in line: newline = line for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include - elif "//hls-fpga-machine-learning insert layer-config" in line: + elif "// hls-fpga-machine-learning insert layer-config" in line: newline = line for layer in model.get_layers(): config = layer.get_attr('config_cpp', None) @@ -487,7 +487,7 @@ def write_testbench_parallel(self, model): if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' newline += ' std::vector::const_iterator in_end;\n' @@ -497,7 +497,7 @@ def write_testbench_parallel(self, model): newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' newline += ' in_begin = in_end;\n' newline += ' outputs.emplace_back();\n' - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' for inp in model.get_input_variables(): @@ -506,7 +506,7 @@ def write_testbench_parallel(self, model): newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' newline += indent + '}\n' - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' @@ -515,20 +515,21 @@ def write_testbench_parallel(self, model): elif 'hls-fpga-machine-learning insert run' in line: newline = line newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '//hls-fpga-machine-learning insert predictions' in line: + elif '// hls-fpga-machine-learning insert predictions' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << predictions[j][i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' newline += indent + '}\n' newline += indent + 
'fout << std::endl;\n' elif ( - '//hls-fpga-machine-learning insert output' in line or '//hls-fpga-machine-learning insert quantized' in line + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line ): newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' @@ -589,7 +590,7 @@ def write_testbench_stream(self, model): if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine learning instantiate inputs and outputs' in line: + elif '// hls-fpga-machine learning instantiate inputs and outputs' in line: newline = line for inp in model_inputs: newline += indent + f'stream_in<{inp.type.name}> {inp.name}_input;\n' @@ -597,7 +598,7 @@ def write_testbench_stream(self, model): newline += indent + f'stream_out<{out.type.name}> {out.name}_output;\n' # TODO - This is one-input specific (are multiple model inputs needed at all?) - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line c = 0 for inp in model_inputs: @@ -611,7 +612,7 @@ def write_testbench_stream(self, model): ) c += 1 - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line c = 0 for inp in model_inputs: @@ -625,7 +626,7 @@ def write_testbench_stream(self, model): ) c += 1 - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line input_params = ', '.join([f'{i.name}_input' for i in model_inputs]) output_params = ', '.join([f'{o.name}_output' for o in model_outputs]) @@ -638,27 +639,27 @@ def write_testbench_stream(self, model): newline = line newline += indent + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '//hls-fpga-machine-learning convert output' in line: + elif '// hls-fpga-machine-learning convert output' in line: newline = line newline += indent + f'float res[{outvar.size_cpp()}];\n' newline += indent + 'nnet::convert_data_back<{}, float, {}>({}_output, res);\n'.format( outvar.type.name, outvar.size_cpp(), outvar.name ) - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' fout << res[i] << " ";\n' newline += indent + '}\n' newline += indent + 'fout << std::endl;\n' - elif '//hls-fpga-machine-learning print predictions' in line: + elif '// hls-fpga-machine-learning print predictions' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << predictions[iteration][i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning print output' in line: + elif '// hls-fpga-machine-learning print output' in line: newline = line newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << res[i] << " "; \n' @@ -711,7 +712,7 @@ def write_bridge(self, model): elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() if io_type == 'io_stream': inputs_str = ', 
'.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) @@ -729,7 +730,7 @@ def write_bridge(self, model): newline += indent + insize_str + ',\n' newline += indent + outsize_str + '\n' - elif '//hls-fpga-machine-learning insert wrapper' in line: + elif '// hls-fpga-machine-learning insert wrapper' in line: dtype = line.split('#', 1)[1].strip() if io_type == 'io_stream': newline = '' @@ -782,7 +783,7 @@ def write_bridge(self, model): newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format( o.type.name, dtype, o.size_cpp(), o.member_name, o.member_name ) - elif '//hls-fpga-machine-learning insert trace_outputs' in line: + elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp') diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index b92ce74ab..46c193fdb 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -1,71 +1,96 @@ import os -from shutil import copyfile, copytree from distutils.dir_util import copy_tree +from shutil import copyfile + from hls4ml.writer.vivado_writer import VivadoWriter -class VivadoAcceleratorWriter(VivadoWriter): +class VivadoAcceleratorWriter(VivadoWriter): def __init__(self): super().__init__() self.vivado_accelerator_config = None def write_axi_wrapper(self, model): - ''' Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces - Args: - model : The ModelGraph to write the wrapper for + '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces + Args: + model : The ModelGraph to write the wrapper for ''' inp_axi_t, out_axi_t, inp, out = self.vivado_accelerator_config.get_corrected_types() indent = ' ' ####################### - ## myproject_axi.h + # myproject_axi.h ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.h'), 'r') - fout = open('{}/firmware/{}_axi.h'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) - elif '//hls-fpga-machine-learning insert include' in line: - newline = '#include "{}.h"\n'.format(model.config.get_project_name()) - elif 'void myproject(' in line: - newline = 'void {}_axi(\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert definitions' in line: + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}.h"\n' + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert definitions' in line: newline = '' - newline += 'static const unsigned N_IN = {};\n'.format(inp.size()) - newline += 'static const unsigned N_OUT = {};\n'.format(out.size()) + newline += f'static const unsigned N_IN = {inp.size()};\n' + newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += 'typedef {} T_in;\n'.format(inp_axi_t) - newline += 'typedef {} T_out;\n'.format(out_axi_t) - 
newline += 'typedef struct in_struct {\n' + \ - indent + 'T_in data;\n' + \ - indent + 'ap_uint<1> last;\n' + \ - indent + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + \ - indent + 'in_struct(){this->data = 0; this->last = 0;};\n' + \ - indent + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + \ - indent + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + \ - indent + 'operator float() const {return this->data;}\n' + \ - indent + 'operator double() const {return this->data;}\n' + \ - indent + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + \ - indent + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + \ - '} input_axi_t;\n' - newline += 'typedef struct out_struct {\n' + \ - indent + 'T_out data;\n' + \ - indent + 'ap_uint<1> last;\n' + \ - indent + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + \ - indent + 'out_struct(){this->data = 0; this->last = 0;};\n' + \ - indent + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + \ - indent + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + \ - indent + 'operator float() const {return this->data;}\n' + \ - indent + 'operator double() const {return this->data;}\n' + \ - indent + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + \ - indent + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + \ - '} output_axi_t;\n' + newline += f'typedef {inp_axi_t} T_in;\n' + newline += f'typedef {out_axi_t} T_out;\n' + newline += ( + 'typedef struct in_struct {\n' + + indent + + 'T_in data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'in_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + + indent + + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + + '} input_axi_t;\n' + ) + newline += ( + 'typedef struct out_struct {\n' + + indent + + 'T_out data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'out_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + + indent + + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + + '} output_axi_t;\n' + ) else: - newline += 'typedef {} input_axi_t;\n'.format(inp_axi_t) - newline += 'typedef {} output_axi_t;\n'.format(out_axi_t) + newline += f'typedef {inp_axi_t} input_axi_t;\n' + newline += f'typedef {out_axi_t} output_axi_t;\n' else: 
newline = line fout.write(newline) @@ -73,21 +98,20 @@ def write_axi_wrapper(self, model): fout.close() ####################### - ## myproject_axi.cpp + # myproject_axi.cpp ####################### - f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.cpp'), 'r') - fout = open('{}/firmware/{}_axi.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), - 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/myproject_axi.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') io_type = model.config.get_config_value("IOType") for line in f.readlines(): - if 'void myproject(' in line: - newline = 'void {}_axi(\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert include' in line: - newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert local vars' in line: + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}_axi.h"\n' + elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' if self.vivado_accelerator_config.get_interface() == 'axi_stream': newline += indent + 'bool is_last = false;\n' @@ -97,14 +121,15 @@ def write_axi_wrapper(self, model): elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' - newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'\ - .format(model.get_input_variables()[0].pragma[1]) - newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\ - .format(model.get_output_variables()[0].pragma[1]) - elif '//hls-fpga-machine-learning insert call' in line: - newline = indent + '{}(in_local, out_local);\n'.format( - model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert interface' in line: + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif '// hls-fpga-machine-learning insert call' in line: + newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' + elif '// hls-fpga-machine-learning insert interface' in line: if self.vivado_accelerator_config.get_interface() == 'axi_lite': newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' @@ -113,10 +138,12 @@ def write_axi_wrapper(self, model): elif self.vivado_accelerator_config.get_interface() == 'axi_master': newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' - newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'\ - .format(model.get_input_variables()[0].pragma[1]) - newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'\ - .format(model.get_output_variables()[0].pragma[1]) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format( + 
model.get_output_variables()[0].pragma[1] + ) elif self.vivado_accelerator_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' @@ -124,7 +151,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' if model.config.get_config_value("IOType") == 'io_stream': newline += indent + '#pragma HLS DATAFLOW\n' - elif '//hls-fpga-machine-learning insert enqueue' in line: + elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': newline = '' @@ -146,15 +173,27 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += indent + indent + indent + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' - newline += indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + ) + newline += ( + indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + ) else: - newline += indent + indent + indent + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + ) newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' newline = newline.format(input_t=inp.type.name) - elif '//hls-fpga-machine-learning insert dequeue' in line: + elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': newline = '' @@ -175,8 +214,15 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vivado_accelerator_config.get_interface() == 'axi_stream': - newline += indent + indent + indent + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' - newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + newline += ( + indent + + indent + + indent + + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? 
true : false;\n' + ) + newline += ( + indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' newline += indent + indent + '}}\n' @@ -193,18 +239,20 @@ def modify_build_script(self, model): Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function ''' filedir = os.path.dirname(os.path.abspath(__file__)) - oldfile = '{}/build_prj.tcl'.format(model.config.get_output_dir()) - newfile = '{}/build_prj_axi.tcl'.format(model.config.get_output_dir()) - f = open(oldfile, 'r') + oldfile = f'{model.config.get_output_dir()}/build_prj.tcl' + newfile = f'{model.config.get_output_dir()}/build_prj_axi.tcl' + f = open(oldfile) fout = open(newfile, 'w') for line in f.readlines(): if 'set_top' in line: newline = line[:-1] + '_axi\n' # remove the newline from the line end and append _axi for the new top - newline += 'add_files firmware/{}_axi.cpp -cflags "-std=c++0x"\n'.format( - model.config.get_project_name()) - elif '{}_cosim'.format(model.config.get_project_name()) in line: - newline = line.replace('{}_cosim'.format(model.config.get_project_name()), '{}_axi_cosim'.format(model.config.get_project_name())) + newline += f'add_files firmware/{model.config.get_project_name()}_axi.cpp -cflags "-std=c++0x"\n' + elif f'{model.config.get_project_name()}_cosim' in line: + newline = line.replace( + f'{model.config.get_project_name()}_cosim', + f'{model.config.get_project_name()}_axi_cosim', + ) elif '${project_name}.tcl' in line: newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl') else: @@ -219,8 +267,8 @@ def modify_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir, '../templates/vivado_accelerator/build_lib.sh'), 'r') - fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()), 'w') + f = open(os.path.join(filedir, '../templates/vivado_accelerator/build_lib.sh')) + fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) @@ -235,34 +283,37 @@ def write_wrapper_test(self, model): ################### # write myproject_test_wrapper.cpp ################### - oldfile = '{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) - newfile = '{}/{}_test_wrapper.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - f = open(oldfile, 'r') + f = open(oldfile) fout = open(newfile, 'w') inp = model.get_input_variables()[0] out = model.get_output_variables()[0] for line in f.readlines(): - if '{}.h'.format(model.config.get_project_name()) in line: - newline = line.replace('{}.h'.format(model.config.get_project_name()), - '{}_axi.h'.format(model.config.get_project_name())) + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: - newline = line.replace(inp.definition_cpp(), 'input_axi_t inputs[N_IN]') #TODO instead of replacing strings, how about we use proper variables and their definition? 
+ newline = line.replace( + inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + ) # TODO instead of replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') elif 'unsigned short' in line: newline = '' - elif '{}('.format(model.config.get_project_name()) in line: + elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) + newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, - 'input_axi_t') + newline = ( + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + ) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, - 'output_axi_t') + newline = ( + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + ) else: newline = line if self.vivado_accelerator_config.get_interface() == 'axi_stream': @@ -280,29 +331,27 @@ def write_wrapper_test(self, model): ################### # write myproject_bridge_wrapper.cpp ################### - oldfile = '{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) - newfile = '{}/{}_bridge_wrapper.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge_wrapper.cpp' - f = open(oldfile, 'r') + f = open(oldfile) fout = open(newfile, 'w') inp = model.get_input_variables()[0] out = model.get_output_variables()[0] for line in f.readlines(): - if '{}.h'.format(model.config.get_project_name()) in line: - newline = line.replace('{}.h'.format(model.config.get_project_name()), - '{}_axi.h'.format(model.config.get_project_name())) + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), - 'input_axi_t {}_ap[N_IN]'.format(inp.name)) + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), - 'output_axi_t {}_ap[N_OUT]'.format(out.name)) - elif '{}('.format(model.config.get_project_name()) in line: + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, - out.name) + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( + model.config.get_project_name(), inp.name, out.name + ) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 
'N_IN').replace(inp.type.name, 'input_axi_t') elif out.size_cpp() in line or out.name in line or out.type.name in line: @@ -320,57 +369,61 @@ def write_board_script(self, model): Write the tcl scripts and kernel sources to create a Vivado IPI project for the VivadoAccelerator ''' filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()), - '{}/design.tcl'.format(model.config.get_output_dir())) + copyfile( + os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()), + f'{model.config.get_output_dir()}/design.tcl', + ) # Generic alveo board if self.vivado_accelerator_config.get_board().startswith('alveo'): - src_dir=os.path.join(filedir, self.vivado_accelerator_config.get_krnl_rtl_src_dir()) - dst_dir= os.path.abspath(model.config.get_output_dir())+'/src' - copy_tree(src_dir,dst_dir) + src_dir = os.path.join(filedir, self.vivado_accelerator_config.get_krnl_rtl_src_dir()) + dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' + copy_tree(src_dir, dst_dir) ################### # project.tcl ################### - f = open('{}/project.tcl'.format(model.config.get_output_dir()), 'w') + f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') f.write('variable project_name\n') - f.write('set project_name "{}"\n'.format(model.config.get_project_name())) + f.write(f'set project_name "{model.config.get_project_name()}"\n') f.write('variable backend\n') f.write('set backend "vivadoaccelerator"\n') f.write('variable part\n') - f.write('set part "{}"\n'.format(self.vivado_accelerator_config.get_part())) + f.write(f'set part "{self.vivado_accelerator_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) f.write('variable clock_uncertainty\n') f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() - f.write('set bit_width_hls_output {}\n'.format(in_bit)) - f.write('set bit_width_hls_input {}\n'.format(out_bit)) + f.write(f'set bit_width_hls_output {in_bit}\n') + f.write(f'set bit_width_hls_input {out_bit}\n') f.close() def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), - ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir())) - + copyfile( + os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), + ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + ) + def write_new_tar(self, model): os.remove(model.config.get_output_dir() + '.tar.gz') - super(VivadoAcceleratorWriter, self).write_tar(model) + super().write_tar(model) - def write_hls(self, model): """ Write the HLS project. 
Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface """ - #TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package from hls4ml.backends import VivadoAcceleratorConfig - self.vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(), - model.get_output_variables()) - super(VivadoAcceleratorWriter, self).write_hls(model) + + self.vivado_accelerator_config = VivadoAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + super().write_hls(model) self.write_board_script(model) self.write_driver(model) self.write_wrapper_test(model) self.write_axi_wrapper(model) self.modify_build_script(model) self.write_new_tar(model) - diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index c70e28bb5..a7d269102 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -125,7 +125,7 @@ def write_project_cpp(self, model): # Add headers to weights and biases if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) @@ -137,7 +137,7 @@ def write_project_cpp(self, model): newline += ',\n' + brams_str newline += '\n' - elif '//hls-fpga-machine-learning insert load weights' in line: + elif '// hls-fpga-machine-learning insert load weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): @@ -155,7 +155,7 @@ def write_project_cpp(self, model): ) # Add input/output type - elif '//hls-fpga-machine-learning insert IO' in line: + elif '// hls-fpga-machine-learning insert IO' in line: newline = line all_inputs = [i.name for i in model_inputs] all_outputs = [o.name for o in model_outputs] @@ -184,7 +184,7 @@ def write_project_cpp(self, model): newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams)) newline += indent + '#pragma HLS DATAFLOW \n' - elif '//hls-fpga-machine-learning insert layers' in line: + elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' for layer in model.get_layers(): vars = layer.get_variables() @@ -243,9 +243,9 @@ def write_project_header(self, model): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) - elif 'void myproject(' in line: - newline = f'void {model.config.get_project_name()}(\n' - elif '//hls-fpga-machine-learning insert header' in line: + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) @@ -276,12 +276,12 @@ def write_defines(self, model): for line in f.readlines(): # Insert numbers - if '//hls-fpga-machine-learning insert numbers' 
in line: + if '// hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) newline += ''.join(numbers) - elif '//hls-fpga-machine-learning insert layer-precision' in line: + elif '// hls-fpga-machine-learning insert layer-precision' in line: newline = line all_precision = OrderedDict() for layer in model.get_layers(): @@ -312,19 +312,19 @@ def write_parameters(self, model): for line in f.readlines(): - if '//hls-fpga-machine-learning insert includes' in line: + if '// hls-fpga-machine-learning insert includes' in line: newline = line for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include - elif '//hls-fpga-machine-learning insert weights' in line: + elif '// hls-fpga-machine-learning insert weights' in line: newline = line for layer in model.get_layers(): for w in layer.get_weights(): if w.storage.lower() != 'bram': newline += f'#include "weights/{w.name}.h"\n' - elif "//hls-fpga-machine-learning insert layer-config" in line: + elif "// hls-fpga-machine-learning insert layer-config" in line: newline = line for layer in model.get_layers(): config = layer.get_attr('config_cpp', None) @@ -415,11 +415,11 @@ def write_test_bench(self, model): # Insert numbers if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) - elif '//hls-fpga-machine-learning insert bram' in line: + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '//hls-fpga-machine-learning insert data' in line: + elif '// hls-fpga-machine-learning insert data' in line: newline = line offset = 0 for inp in model_inputs: @@ -430,14 +430,14 @@ def write_test_bench(self, model): offset += inp.size() for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' - elif '//hls-fpga-machine-learning insert zero' in line: + elif '// hls-fpga-machine-learning insert zero' in line: newline = line for inp in model_inputs: newline += ' ' + inp.definition_cpp() + ';\n' newline += f' nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' - elif '//hls-fpga-machine-learning insert top-level-function' in line: + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line input_vars = ','.join([i.name for i in model_inputs]) @@ -450,21 +450,22 @@ def write_test_bench(self, model): top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' newline += top_level - elif '//hls-fpga-machine-learning insert predictions' in line: + elif '// hls-fpga-machine-learning insert predictions' in line: newline = line for out in model_outputs: newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' newline += indent + ' std::cout << pr[i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' - elif '//hls-fpga-machine-learning insert tb-output' in line: + elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line for out in model_outputs: newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( out.type.name, out.size_cpp(), out.name ) # TODO enable this elif ( - '//hls-fpga-machine-learning insert output' in line or '//hls-fpga-machine-learning insert quantized' in line + '// hls-fpga-machine-learning insert output' in line 
+ or '// hls-fpga-machine-learning insert quantized' in line ): newline = line for out in model_outputs: @@ -500,11 +501,11 @@ def write_bridge(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) - elif '//hls-fpga-machine-learning insert bram' in line: + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '//hls-fpga-machine-learning insert header' in line: + elif '// hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) @@ -512,7 +513,7 @@ def write_bridge(self, model): newline = '' newline += indent + inputs_str + ',\n' newline += indent + outputs_str + '\n' - elif '//hls-fpga-machine-learning insert wrapper' in line: + elif '// hls-fpga-machine-learning insert wrapper' in line: dtype = line.split('#', 1)[1].strip() newline = '' for i in model_inputs: @@ -543,7 +544,7 @@ def write_bridge(self, model): newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( o.type.name, dtype, o.size_cpp(), o.name, o.name ) - elif '//hls-fpga-machine-learning insert trace_outputs' in line: + elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp', None) @@ -661,7 +662,7 @@ def write_generated_code(self, model): f = open(path, 'w') for line in contents: - if '//hls4ml insert code' in line: + if '// hls4ml insert code' in line: newline = line for layer in model.get_layers(): for generated_code in layer.code.values(): diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index e7fa1ea15..8645ecd0b 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -4,7 +4,7 @@ import numpy as np import pytest from qkeras.qlayers import QActivation, QDense -from qkeras.quantizers import binary, quantized_bits, quantized_relu, ternary +from qkeras.quantizers import binary, quantized_bits, quantized_relu, quantized_sigmoid, quantized_tanh, ternary from qkeras.utils import _add_supported_quantized_objects from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split @@ -65,9 +65,6 @@ def convert(load_jettagging_model, strategy): Convert a QKeras model trained on the jet tagging dataset ''' model = load_jettagging_model - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=['Activation'], rounding_mode='AP_RND', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') config['Model']['Strategy'] = strategy @@ -79,7 +76,6 @@ def convert(load_jettagging_model, strategy): output_dir=str(test_root_path / f'hls4mlprj_qkeras_accuracy_{strategy}'), part='xcu250-figd2104-2L-e', ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() return hls_model @@ -149,15 +145,11 @@ def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_ty model.add(QActivation(activation=quantized_relu(bits, 0), name='relu1')) model.compile() - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=['relu1'], 
rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') output_dir = str(test_root_path / f'hls4mlprj_qkeras_single_dense_activation_exact_{bits}_{alpha}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() y_qkeras = model.predict(X) @@ -185,6 +177,38 @@ def randX_100_10(): return randX(100, 10) +@pytest.mark.parametrize( + 'quantizer', [(quantized_tanh(8)), (quantized_sigmoid(5)), (quantized_sigmoid(7, use_real_sigmoid=True))] +) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_quantizer_special(randX_1000_1, quantizer, backend, io_type): + ''' + Test a single quantizer (tanh or sigmoid) as an Activation function. + Checks the type inference through the conversion is correct without just + using the same logic. + ''' + X = randX_1000_1 + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = Sequential() + model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) + model.compile() + + config = hls4ml.utils.config_from_keras_model(model, granularity='name') + output_dir = str( + test_root_path / f'hls4mlprj_qkeras_quantizer_{quantizer.__class__.__name__}_{quantizer.bits}_{backend}_{io_type}' + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + # Goal is to get it passing with all equal + np.testing.assert_allclose(y_qkeras, y_hls4ml, rtol=1e-2, atol=0.02) + + @pytest.mark.parametrize( 'test_no,N,kernel_quantizer,bias_quantizer,activation_quantizer,use_batchnorm,is_xnor', [ @@ -254,9 +278,6 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) model.compile() - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=['quantizer'], rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') output_dir = str( test_root_path @@ -267,7 +288,6 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() y_qkeras = model.predict(X) @@ -304,15 +324,11 @@ def test_qactivation_kwarg(randX_100_10, activation_quantizer, weight_quantizer) )(inputs) model = Model(inputs, outputs) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure( - layers=[name], rounding_mode='AP_RND_CONV', saturation_mode='AP_SAT' - ) config = hls4ml.utils.config_from_keras_model(model, granularity='name') out_dir = str(test_root_path / f'hls4mlprj_qactivation_kwarg_{activation_quantizer}') hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=out_dir) - hls4ml.model.optimizer.get_optimizer('output_rounding_saturation_mode').configure(layers=[]) hls_model.compile() # Verify if activation in hls_model diff 
--git a/test/pytest/test_report.py b/test/pytest/test_report.py new file mode 100644 index 000000000..b08709b2b --- /dev/null +++ b/test/pytest/test_report.py @@ -0,0 +1,71 @@ +import os +import shutil +from pathlib import Path + +import pytest +from tensorflow.keras.layers import Dense +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('backend', ['Vivado']) +def test_report(backend, capsys): + model = Sequential() + model.add(Dense(5, input_shape=(16,), name='fc1', activation='relu')) + + config = hls4ml.utils.config_from_keras_model(model, granularity='model') + + output_dir = str(test_root_path / f'hls4mlprj_report_{backend}') + + hls_model = hls4ml.converters.convert_from_keras_model( + model, io_type='io_stream', hls_config=config, output_dir=output_dir, part='xc7z020clg400-1', backend=backend + ) + hls_model.write() + + # to actually generate the reports (using Vivado 2020.1) + # hls_model.build(synth=True, vsynth=True) + + # copy pregenerated reports + os.makedirs(f'hls4mlprj_report_{backend}/myproject_prj/solution1/syn/report', exist_ok=True) + shutil.copy('test_report/vivado_hls.app', f'{output_dir}/myproject_prj/vivado_hls.app') + shutil.copy('test_report/myproject_csynth.rpt', f'{output_dir}/myproject_prj/solution1/syn/report/myproject_csynth.rpt') + shutil.copy('test_report/myproject_csynth.xml', f'{output_dir}/myproject_prj/solution1/syn/report/myproject_csynth.xml') + shutil.copy('test_report/vivado_synth.rpt', f'{output_dir}/vivado_synth.rpt') + + report = hls4ml.report.parse_vivado_report(output_dir) # or report = hls_model.build(...) + + capsys.readouterr() # capture to clear + hls4ml.report.print_vivado_report(report) + captured = capsys.readouterr() # capture again to test + + assert ( + captured.out + == '\n' + + '======================================================\n' + + '== C Synthesis report\n' + + '======================================================\n\n' + + ' - Performance estimates:\n' + + ' Best-case latency: 10 (50.0 ns)\n' + + ' Worst-case latency: 10 (50.0 ns)\n' + + ' Interval Min: 8\n' + + ' Interval Max: 8\n' + + ' Estimated Clock Period: 4.049\n\n' + + ' - Resource estimates:\n' + + ' BRAM_18K: 0 / 280 (0.0%)\n' + + ' DSP48E: 73 / 220 (33.2%)\n' + + ' FF: 7969 / 106400 (7.5%)\n' + + ' LUT: 2532 / 53200 (4.8%)\n' + + ' URAM: N/A\n\n' + + '======================================================\n' + + '== Vivado Synthesis report\n' + + '======================================================\n\n' + + ' - Resource utilization:\n' + + ' BRAM_18K: 0\n' + + ' DSP48E: 66\n' + + ' FF: 2428\n' + + ' LUT: 1526\n' + + ' URAM: N/A\n\n' + ) diff --git a/test/pytest/test_report/myproject_csynth.rpt b/test/pytest/test_report/myproject_csynth.rpt new file mode 100644 index 000000000..8354501db --- /dev/null +++ b/test/pytest/test_report/myproject_csynth.rpt @@ -0,0 +1,196 @@ + + +================================================================ +== Vivado HLS Report for 'myproject' +================================================================ +* Date: Sat Mar 18 22:59:37 2023 + +* Version: 2020.1 (Build 2897737 on Wed May 27 20:21:37 MDT 2020) +* Project: myproject_prj +* Solution: solution1 +* Product family: zynq +* Target device: xc7z020-clg400-1 + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + +--------+---------+----------+------------+ + 
| Clock | Target | Estimated| Uncertainty| + +--------+---------+----------+------------+ + |ap_clk | 5.00 ns | 4.049 ns | 0.62 ns | + +--------+---------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+-----------+-----------+-----+-----+----------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline | + | min | max | min | max | min | max | Type | + +---------+---------+-----------+-----------+-----+-----+----------+ + | 10| 10| 50.000 ns | 50.000 ns | 8| 8| dataflow | + +---------+---------+-----------+-----------+-----+-----+----------+ + + + Detail: + * Instance: + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + | | | Latency (cycles) | Latency (absolute) | Interval | Pipeline | + | Instance | Module | min | max | min | max | min | max | Type | + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_U0 |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_s | 7| 7| 35.000 ns | 35.000 ns | 7| 7| none | + |relu_array_array_ap_fixed_5u_relu_config3_U0 |relu_array_array_ap_fixed_5u_relu_config3_s | 2| 2| 10.000 ns | 10.000 ns | 1| 1| function | + +-----------------------------------------------------+----------------------------------------------------+---------+---------+-----------+-----------+-----+-----+----------+ + + * Loop: + N/A + + + +================================================================ +== Utilization Estimates +================================================================ +* Summary: ++-----------------+---------+-------+--------+-------+-----+ +| Name | BRAM_18K| DSP48E| FF | LUT | URAM| ++-----------------+---------+-------+--------+-------+-----+ +|DSP | -| -| -| -| -| +|Expression | -| -| 0| 2| -| +|FIFO | 0| -| 25| 140| -| +|Instance | 0| 73| 7944| 2390| -| +|Memory | -| -| -| -| -| +|Multiplexer | -| -| -| -| -| +|Register | -| -| -| -| -| ++-----------------+---------+-------+--------+-------+-----+ +|Total | 0| 73| 7969| 2532| 0| ++-----------------+---------+-------+--------+-------+-----+ +|Available | 280| 220| 106400| 53200| 0| ++-----------------+---------+-------+--------+-------+-----+ +|Utilization (%) | 0| 33| 7| 4| 0| ++-----------------+---------+-------+--------+-------+-----+ + ++ Detail: + * Instance: + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + | Instance | Module | BRAM_18K| DSP48E| FF | LUT | URAM| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_U0 |dense_array_array_ap_fixed_16_6_5_3_0_5u_config2_s | 0| 73| 7860| 2134| 0| + |relu_array_array_ap_fixed_5u_relu_config3_U0 |relu_array_array_ap_fixed_5u_relu_config3_s | 0| 0| 84| 256| 0| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + |Total | | 0| 73| 7944| 2390| 0| + +-----------------------------------------------------+----------------------------------------------------+---------+-------+------+------+-----+ + + * DSP48E: + N/A + + * Memory: + N/A + + * FIFO: + 
+-------------------------+---------+---+----+-----+------+-----+---------+ + | Name | BRAM_18K| FF| LUT| URAM| Depth| Bits| Size:D*B| + +-------------------------+---------+---+----+-----+------+-----+---------+ + |layer2_out_V_data_0_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_1_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_2_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_3_V_U | 0| 5| 0| -| 1| 16| 16| + |layer2_out_V_data_4_V_U | 0| 5| 0| -| 1| 16| 16| + +-------------------------+---------+---+----+-----+------+-----+---------+ + |Total | 0| 25| 0| 0| 5| 80| 80| + +-------------------------+---------+---+----+-----+------+-----+---------+ + + * Expression: + +--------------+----------+-------+---+----+------------+------------+ + | Variable Name| Operation| DSP48E| FF| LUT| Bitwidth P0| Bitwidth P1| + +--------------+----------+-------+---+----+------------+------------+ + |ap_idle | and | 0| 0| 2| 1| 1| + +--------------+----------+-------+---+----+------------+------------+ + |Total | | 0| 0| 2| 1| 1| + +--------------+----------+-------+---+----+------------+------------+ + + * Multiplexer: + N/A + + * Register: + N/A + + + +================================================================ +== Interface +================================================================ +* Summary: ++------------------------------+-----+-----+------------+-----------------------+--------------+ +| RTL Ports | Dir | Bits| Protocol | Source Object | C Type | ++------------------------------+-----+-----+------------+-----------------------+--------------+ +|fc1_input_V_data_0_V_TDATA | in | 16| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_0_V_TVALID | in | 1| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_0_V_TREADY | out | 1| axis | fc1_input_V_data_0_V | pointer | +|fc1_input_V_data_1_V_TDATA | in | 16| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_1_V_TVALID | in | 1| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_1_V_TREADY | out | 1| axis | fc1_input_V_data_1_V | pointer | +|fc1_input_V_data_2_V_TDATA | in | 16| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_2_V_TVALID | in | 1| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_2_V_TREADY | out | 1| axis | fc1_input_V_data_2_V | pointer | +|fc1_input_V_data_3_V_TDATA | in | 16| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_3_V_TVALID | in | 1| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_3_V_TREADY | out | 1| axis | fc1_input_V_data_3_V | pointer | +|fc1_input_V_data_4_V_TDATA | in | 16| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_4_V_TVALID | in | 1| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_4_V_TREADY | out | 1| axis | fc1_input_V_data_4_V | pointer | +|fc1_input_V_data_5_V_TDATA | in | 16| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_5_V_TVALID | in | 1| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_5_V_TREADY | out | 1| axis | fc1_input_V_data_5_V | pointer | +|fc1_input_V_data_6_V_TDATA | in | 16| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_6_V_TVALID | in | 1| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_6_V_TREADY | out | 1| axis | fc1_input_V_data_6_V | pointer | +|fc1_input_V_data_7_V_TDATA | in | 16| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_7_V_TVALID | in | 1| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_7_V_TREADY | out | 1| axis | fc1_input_V_data_7_V | pointer | +|fc1_input_V_data_8_V_TDATA | in | 16| axis | 
fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_8_V_TVALID | in | 1| axis | fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_8_V_TREADY | out | 1| axis | fc1_input_V_data_8_V | pointer | +|fc1_input_V_data_9_V_TDATA | in | 16| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_9_V_TVALID | in | 1| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_9_V_TREADY | out | 1| axis | fc1_input_V_data_9_V | pointer | +|fc1_input_V_data_10_V_TDATA | in | 16| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_10_V_TVALID | in | 1| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_10_V_TREADY | out | 1| axis | fc1_input_V_data_10_V | pointer | +|fc1_input_V_data_11_V_TDATA | in | 16| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_11_V_TVALID | in | 1| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_11_V_TREADY | out | 1| axis | fc1_input_V_data_11_V | pointer | +|fc1_input_V_data_12_V_TDATA | in | 16| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_12_V_TVALID | in | 1| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_12_V_TREADY | out | 1| axis | fc1_input_V_data_12_V | pointer | +|fc1_input_V_data_13_V_TDATA | in | 16| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_13_V_TVALID | in | 1| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_13_V_TREADY | out | 1| axis | fc1_input_V_data_13_V | pointer | +|fc1_input_V_data_14_V_TDATA | in | 16| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_14_V_TVALID | in | 1| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_14_V_TREADY | out | 1| axis | fc1_input_V_data_14_V | pointer | +|fc1_input_V_data_15_V_TDATA | in | 16| axis | fc1_input_V_data_15_V | pointer | +|fc1_input_V_data_15_V_TVALID | in | 1| axis | fc1_input_V_data_15_V | pointer | +|fc1_input_V_data_15_V_TREADY | out | 1| axis | fc1_input_V_data_15_V | pointer | +|layer3_out_V_data_0_V_TDATA | out | 16| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_0_V_TVALID | out | 1| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_0_V_TREADY | in | 1| axis | layer3_out_V_data_0_V | pointer | +|layer3_out_V_data_1_V_TDATA | out | 16| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_1_V_TVALID | out | 1| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_1_V_TREADY | in | 1| axis | layer3_out_V_data_1_V | pointer | +|layer3_out_V_data_2_V_TDATA | out | 16| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_2_V_TVALID | out | 1| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_2_V_TREADY | in | 1| axis | layer3_out_V_data_2_V | pointer | +|layer3_out_V_data_3_V_TDATA | out | 16| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_3_V_TVALID | out | 1| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_3_V_TREADY | in | 1| axis | layer3_out_V_data_3_V | pointer | +|layer3_out_V_data_4_V_TDATA | out | 16| axis | layer3_out_V_data_4_V | pointer | +|layer3_out_V_data_4_V_TVALID | out | 1| axis | layer3_out_V_data_4_V | pointer | +|layer3_out_V_data_4_V_TREADY | in | 1| axis | layer3_out_V_data_4_V | pointer | +|ap_clk | in | 1| ap_ctrl_hs | myproject | return value | +|ap_rst_n | in | 1| ap_ctrl_hs | myproject | return value | +|ap_start | in | 1| ap_ctrl_hs | myproject | return value | +|ap_done | out | 1| ap_ctrl_hs | myproject | return value | +|ap_ready | out | 1| ap_ctrl_hs | myproject | return value | +|ap_idle | out | 1| ap_ctrl_hs | myproject | return value | 
++------------------------------+-----+-----+------------+-----------------------+--------------+ + diff --git a/test/pytest/test_report/myproject_csynth.xml b/test/pytest/test_report/myproject_csynth.xml new file mode 100644 index 000000000..711a5ec12 --- /dev/null +++ b/test/pytest/test_report/myproject_csynth.xml @@ -0,0 +1,878 @@ + + + +2020.1 + + + +ns +zynq +xc7z020-clg400-1 +myproject +5.00 +0.62 + + + +dataflow + +ns +4.049 + + +clock cycles +10 +10 +10 +50.000 ns +50.000 ns +50.000 ns +8 +8 +8 + + + + + +0 +73 +7969 +2532 +0 + + +280 +220 +106400 +53200 +0 + + + + + +fc1_input_V_data_0_V_TDATA +fc1_input_V_data_0_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_0_V_TVALID +fc1_input_V_data_0_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_0_V_TREADY +fc1_input_V_data_0_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_1_V_TDATA +fc1_input_V_data_1_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_1_V_TVALID +fc1_input_V_data_1_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_1_V_TREADY +fc1_input_V_data_1_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_2_V_TDATA +fc1_input_V_data_2_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_2_V_TVALID +fc1_input_V_data_2_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_2_V_TREADY +fc1_input_V_data_2_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_3_V_TDATA +fc1_input_V_data_3_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_3_V_TVALID +fc1_input_V_data_3_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_3_V_TREADY +fc1_input_V_data_3_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_4_V_TDATA +fc1_input_V_data_4_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_4_V_TVALID +fc1_input_V_data_4_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_4_V_TREADY +fc1_input_V_data_4_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_5_V_TDATA +fc1_input_V_data_5_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_5_V_TVALID +fc1_input_V_data_5_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_5_V_TREADY +fc1_input_V_data_5_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_6_V_TDATA +fc1_input_V_data_6_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_6_V_TVALID +fc1_input_V_data_6_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_6_V_TREADY +fc1_input_V_data_6_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_7_V_TDATA +fc1_input_V_data_7_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_7_V_TVALID +fc1_input_V_data_7_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_7_V_TREADY +fc1_input_V_data_7_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_8_V_TDATA +fc1_input_V_data_8_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_8_V_TVALID +fc1_input_V_data_8_V +pointer + +axis +register, both mode +in +1 +control +int + + 
+fc1_input_V_data_8_V_TREADY +fc1_input_V_data_8_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_9_V_TDATA +fc1_input_V_data_9_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_9_V_TVALID +fc1_input_V_data_9_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_9_V_TREADY +fc1_input_V_data_9_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_10_V_TDATA +fc1_input_V_data_10_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_10_V_TVALID +fc1_input_V_data_10_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_10_V_TREADY +fc1_input_V_data_10_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_11_V_TDATA +fc1_input_V_data_11_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_11_V_TVALID +fc1_input_V_data_11_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_11_V_TREADY +fc1_input_V_data_11_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_12_V_TDATA +fc1_input_V_data_12_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_12_V_TVALID +fc1_input_V_data_12_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_12_V_TREADY +fc1_input_V_data_12_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_13_V_TDATA +fc1_input_V_data_13_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_13_V_TVALID +fc1_input_V_data_13_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_13_V_TREADY +fc1_input_V_data_13_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_14_V_TDATA +fc1_input_V_data_14_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_14_V_TVALID +fc1_input_V_data_14_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_14_V_TREADY +fc1_input_V_data_14_V +pointer + +axis +register, both mode +out +1 +control +int + + +fc1_input_V_data_15_V_TDATA +fc1_input_V_data_15_V +pointer + +axis +register, both mode +in +16 +data +int + + +fc1_input_V_data_15_V_TVALID +fc1_input_V_data_15_V +pointer + +axis +register, both mode +in +1 +control +int + + +fc1_input_V_data_15_V_TREADY +fc1_input_V_data_15_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_0_V_TDATA +layer3_out_V_data_0_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_0_V_TVALID +layer3_out_V_data_0_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_0_V_TREADY +layer3_out_V_data_0_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_1_V_TDATA +layer3_out_V_data_1_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_1_V_TVALID +layer3_out_V_data_1_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_1_V_TREADY +layer3_out_V_data_1_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_2_V_TDATA +layer3_out_V_data_2_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_2_V_TVALID +layer3_out_V_data_2_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_2_V_TREADY +layer3_out_V_data_2_V +pointer + +axis +register, both mode +in +1 
+control +int + + +layer3_out_V_data_3_V_TDATA +layer3_out_V_data_3_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_3_V_TVALID +layer3_out_V_data_3_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_3_V_TREADY +layer3_out_V_data_3_V +pointer + +axis +register, both mode +in +1 +control +int + + +layer3_out_V_data_4_V_TDATA +layer3_out_V_data_4_V +pointer + +axis +register, both mode +out +16 +data +int + + +layer3_out_V_data_4_V_TVALID +layer3_out_V_data_4_V +pointer + +axis +register, both mode +out +1 +control +int + + +layer3_out_V_data_4_V_TREADY +layer3_out_V_data_4_V +pointer + +axis +register, both mode +in +1 +control +int + + +ap_clk +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_rst_n +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_start +myproject +return value + +ap_ctrl_hs + +in +1 +control + + +ap_done +myproject +return value + +ap_ctrl_hs + +out +1 +control + + +ap_ready +myproject +return value + +ap_ctrl_hs + +out +1 +control + + +ap_idle +myproject +return value + +ap_ctrl_hs + +out +1 +control + + + + diff --git a/test/pytest/test_report/vivado_hls.app b/test/pytest/test_report/vivado_hls.app new file mode 100644 index 000000000..c57b8a471 --- /dev/null +++ b/test/pytest/test_report/vivado_hls.app @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/test/pytest/test_report/vivado_synth.rpt b/test/pytest/test_report/vivado_synth.rpt new file mode 100644 index 000000000..971f3e549 --- /dev/null +++ b/test/pytest/test_report/vivado_synth.rpt @@ -0,0 +1,184 @@ +Copyright 1986-2020 Xilinx, Inc. All Rights Reserved. +------------------------------------------------------------------------------------ +| Tool Version : Vivado v.2020.1 (lin64) Build 2902540 Wed May 27 19:54:35 MDT 2020 +| Date : Sun Mar 19 07:48:36 2023 +| Host : mulder.t2.ucsd.edu running 64-bit unknown +| Command : report_utilization -file vivado_synth.rpt +| Design : myproject +| Device : 7z020clg400-1 +| Design State : Synthesized +------------------------------------------------------------------------------------ + +Utilization Design Information + +Table of Contents +----------------- +1. Slice Logic +1.1 Summary of Registers by Type +2. Memory +3. DSP +4. IO and GT Specific +5. Clocking +6. Specific Feature +7. Primitives +8. Black Boxes +9. Instantiated Netlists + +1. Slice Logic +-------------- + ++----------------------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------------------+------+-------+-----------+-------+ +| Slice LUTs* | 1526 | 0 | 53200 | 2.87 | +| LUT as Logic | 1478 | 0 | 53200 | 2.78 | +| LUT as Memory | 48 | 0 | 17400 | 0.28 | +| LUT as Distributed RAM | 0 | 0 | | | +| LUT as Shift Register | 48 | 0 | | | +| Slice Registers | 2428 | 0 | 106400 | 2.28 | +| Register as Flip Flop | 2428 | 0 | 106400 | 2.28 | +| Register as Latch | 0 | 0 | 106400 | 0.00 | +| F7 Muxes | 0 | 0 | 26600 | 0.00 | +| F8 Muxes | 0 | 0 | 13300 | 0.00 | ++----------------------------+------+-------+-----------+-------+ +* Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
+ + +1.1 Summary of Registers by Type +-------------------------------- + ++-------+--------------+-------------+--------------+ +| Total | Clock Enable | Synchronous | Asynchronous | ++-------+--------------+-------------+--------------+ +| 0 | _ | - | - | +| 0 | _ | - | Set | +| 0 | _ | - | Reset | +| 0 | _ | Set | - | +| 0 | _ | Reset | - | +| 0 | Yes | - | - | +| 0 | Yes | - | Set | +| 0 | Yes | - | Reset | +| 18 | Yes | Set | - | +| 2410 | Yes | Reset | - | ++-------+--------------+-------------+--------------+ + + +2. Memory +--------- + ++----------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------+------+-------+-----------+-------+ +| Block RAM Tile | 0 | 0 | 140 | 0.00 | +| RAMB36/FIFO* | 0 | 0 | 140 | 0.00 | +| RAMB18 | 0 | 0 | 280 | 0.00 | ++----------------+------+-------+-----------+-------+ +* Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 + + +3. DSP +------ + ++----------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++----------------+------+-------+-----------+-------+ +| DSPs | 66 | 0 | 220 | 30.00 | +| DSP48E1 only | 66 | | | | ++----------------+------+-------+-----------+-------+ + + +4. IO and GT Specific +--------------------- + ++-----------------------------+------+-------+-----------+--------+ +| Site Type | Used | Fixed | Available | Util% | ++-----------------------------+------+-------+-----------+--------+ +| Bonded IOB | 384 | 0 | 125 | 307.20 | +| Bonded IPADs | 0 | 0 | 2 | 0.00 | +| Bonded IOPADs | 0 | 0 | 130 | 0.00 | +| PHY_CONTROL | 0 | 0 | 4 | 0.00 | +| PHASER_REF | 0 | 0 | 4 | 0.00 | +| OUT_FIFO | 0 | 0 | 16 | 0.00 | +| IN_FIFO | 0 | 0 | 16 | 0.00 | +| IDELAYCTRL | 0 | 0 | 4 | 0.00 | +| IBUFDS | 0 | 0 | 121 | 0.00 | +| PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 16 | 0.00 | +| PHASER_IN/PHASER_IN_PHY | 0 | 0 | 16 | 0.00 | +| IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 200 | 0.00 | +| ILOGIC | 0 | 0 | 125 | 0.00 | +| OLOGIC | 0 | 0 | 125 | 0.00 | ++-----------------------------+------+-------+-----------+--------+ + + +5. Clocking +----------- + ++------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++------------+------+-------+-----------+-------+ +| BUFGCTRL | 1 | 0 | 32 | 3.13 | +| BUFIO | 0 | 0 | 16 | 0.00 | +| MMCME2_ADV | 0 | 0 | 4 | 0.00 | +| PLLE2_ADV | 0 | 0 | 4 | 0.00 | +| BUFMRCE | 0 | 0 | 8 | 0.00 | +| BUFHCE | 0 | 0 | 72 | 0.00 | +| BUFR | 0 | 0 | 16 | 0.00 | ++------------+------+-------+-----------+-------+ + + +6. Specific Feature +------------------- + ++-------------+------+-------+-----------+-------+ +| Site Type | Used | Fixed | Available | Util% | ++-------------+------+-------+-----------+-------+ +| BSCANE2 | 0 | 0 | 4 | 0.00 | +| CAPTUREE2 | 0 | 0 | 1 | 0.00 | +| DNA_PORT | 0 | 0 | 1 | 0.00 | +| EFUSE_USR | 0 | 0 | 1 | 0.00 | +| FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | +| ICAPE2 | 0 | 0 | 2 | 0.00 | +| STARTUPE2 | 0 | 0 | 1 | 0.00 | +| XADC | 0 | 0 | 1 | 0.00 | ++-------------+------+-------+-----------+-------+ + + +7. 
Primitives +------------- + ++----------+------+---------------------+ +| Ref Name | Used | Functional Category | ++----------+------+---------------------+ +| FDRE | 2410 | Flop & Latch | +| LUT2 | 864 | LUT | +| LUT3 | 671 | LUT | +| LUT4 | 499 | LUT | +| CARRY4 | 295 | CarryLogic | +| IBUF | 280 | IO | +| OBUF | 104 | IO | +| DSP48E1 | 66 | Block Arithmetic | +| LUT1 | 63 | LUT | +| SRL16E | 48 | Distributed Memory | +| LUT5 | 43 | LUT | +| LUT6 | 34 | LUT | +| FDSE | 18 | Flop & Latch | +| BUFG | 1 | Clock | ++----------+------+---------------------+ + + +8. Black Boxes +-------------- + ++----------+------+ +| Ref Name | Used | ++----------+------+ + + +9. Instantiated Netlists +------------------------ + ++----------+------+ +| Ref Name | Used | ++----------+------+ + +
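
Usage sketch for the report utilities exercised by test/pytest/test_report.py above. This is an illustrative example, not part of the patch: it assumes a hypothetical hls4ml output directory (here 'my-hls-prj') that already contains the same report layout the test copies from test/pytest/test_report/ (vivado_hls.app, the <project>_prj/solution1/syn/report/<project>_csynth.rpt and .xml files, and vivado_synth.rpt), for example after running hls_model.build(synth=True, vsynth=True) with Vivado 2020.1.

    import hls4ml

    # Hypothetical path to an already-synthesized hls4ml project directory.
    output_dir = 'my-hls-prj'

    # parse_vivado_report() collects the C synthesis and Vivado synthesis
    # results from the project directory into a dictionary; as noted in the
    # test, hls_model.build(...) returns the same kind of report object.
    report = hls4ml.report.parse_vivado_report(output_dir)

    # print_vivado_report() pretty-prints that dictionary, producing the
    # "C Synthesis report" / "Vivado Synthesis report" summary that the new
    # test asserts against.
    hls4ml.report.print_vivado_report(report)
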