diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd44d466..5fe692052 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,8 +5,10 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig +from hls4ml.backends.vitis.vitis_backend import VitisBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) +register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) diff --git a/hls4ml/backends/vitis/__init__.py b/hls4ml/backends/vitis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/vitis/passes/__init__.py b/hls4ml/backends/vitis/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py new file mode 100644 index 000000000..eddd5530f --- /dev/null +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -0,0 +1,28 @@ +from hls4ml.model.optimizer import OptimizerPass + + +class ValidateConvImplementation(OptimizerPass): + + def match(self, node): + return 'Conv' in node.class_name + + def transform(self, model, node): + if node.get_attr('implementation', 'linebuffer') == 'encoded': + print(f'WARNING: "Encoded" implementation in "{node.name}" ({node.class_name}) is not supported in Vitis backend. 
Switching to "LineBuffer" implementation.') + node.set_attr('implementation', 'linebuffer') + + +class ValidateStrategy(OptimizerPass): + _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] + + def match(self, node): + is_resource_layer = len([layer_cls for layer_cls in self._resource_layer_cls if layer_cls in node.class_name]) > 0 + is_resource_strategy = node.model.config.is_resource_strategy(node) + + return is_resource_layer and is_resource_strategy + + def transform(self, model, node): + n_in, _ = model.config.backend.get_layer_mult_size(node) + rf = node.get_attr('reuse_factor') + if rf > n_in and rf % n_in > 0: + print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. Consider using a different ReuseFactor or switching to "Latency" strategy.') diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py new file mode 100644 index 000000000..8fc4ab9c3 --- /dev/null +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -0,0 +1,46 @@ +import os +import sys + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow, get_flow +from hls4ml.report import parse_vivado_report + + +class VitisBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='Vitis') + self._register_layer_attributes() + self._register_flows() + + def _register_flows(self): + validation_passes = [ + 'vitis:validate_conv_implementation', + 'vitis:validate_strategy', + ] + validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) + + # Any potential templates registered specifically for Vitis backend + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=['vivado:init_layers'], backend=self.name) + + writer_passes = ['make_stamp', 'vitis:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], 
backend=self.name) + + ip_flow_requirements = get_flow('vivado:ip').requires.copy() + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow) + ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + if 'linux' in sys.platform: + found = os.system('command -v vitis_hls > /dev/null') + if found != 0: + raise Exception('Vitis HLS installation not found. Make sure "vitis_hls" is on PATH.') + + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + os.system('vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation} export={export} vsynth={vsynth}"' + .format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)) + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) \ No newline at end of file diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 195fc00b5..005a0718a 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -9,6 +9,8 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; @@ -123,6 +125,7 @@ def format(self, node): static const unsigned out_width = {out_width}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; + 
static const unsigned multiplier_limit = DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; @@ -363,6 +366,9 @@ def format(self, node): # Depthwise config params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index a73350a29..c8119c0c2 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; @@ -63,6 +64,7 @@ def format(self, node): static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); static const bool store_weights_in_bram = false; typedef {bias_t.name} bias_t; typedef {scale_t.name} scale_t; diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 863512c4c..7aa705750 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -49,6 +49,7 @@ def format(self, node): static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); typedef {accum_t.name} accum_t; template using product = nnet::product::{product_type}; diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 74ec61e82..d7c826e74 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -12,6 +12,7 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 504df597e..68c3ad9dd 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -56,15 +56,21 @@ def _find_solutions(sln_dir): solutions = [] if os.path.isfile(sln_dir + '/vivado_hls.app'): - with open(sln_dir + '/vivado_hls.app') as f: - # Get rid of namespaces (workaround to support two types of vivado_hls.app files) - xmlstring = re.sub(' xmlns="[^"]+"', '', 
f.read(), count=1) + sln_file = 'vivado_hls.app' + elif os.path.isfile(sln_dir + '/hls.app'): + sln_file = 'hls.app' + else: + return solutions + + with open(sln_dir + '/' + sln_file) as f: + # Get rid of namespaces (workaround to support two types of vivado_hls.app files) + xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1) - root = ET.fromstring(xmlstring) - for sln_tag in root.findall('solutions/solution'): - sln_name = sln_tag.get('name') - if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): - solutions.append(sln_name) + root = ET.fromstring(xmlstring) + for sln_tag in root.findall('solutions/solution'): + sln_name = sln_tag.get('name') + if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name): + solutions.append(sln_name) return solutions @@ -172,8 +178,13 @@ def parse_vivado_report(hls_dir): # Area area_node = root.find('./AreaEstimates') for child in area_node.find('./Resources'): + # DSPs are called 'DSP48E' in Vivado and just 'DSP' in Vitis. 
Overriding here to have consistent keys + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report[child.tag] = child.text for child in area_node.find('./AvailableResources'): + if child.tag == 'DSP48E': + child.tag = 'DSP' c_synth_report['Available' + child.tag] = child.text report['CSynthesisReport'] = c_synth_report else: diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 000000000..6477bbd90 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,102 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. 
+ typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..f054adc3d --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,36 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +template +void conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + +} + + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 000000000..ea0afc7d2 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,104 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * 
CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = multiplier_limit / mult_n_out; + + assert((multiplier_limit % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind 
+ + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf])); + } + + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 000000000..1c77f4f3e --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,81 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// Line Buffer +template +void conv_2d_buffer_latency_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < 
CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_buffer_resource_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1,1)][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_buffer_latency_cl(data, res, weights, biases); + } else { + conv_2d_buffer_resource_cl(data, res, weights, biases); + } +} + +} +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h new file mode 100644 index 000000000..d96b75b47 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,247 @@ +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; 
+ + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + // Increment in_index + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::n_in); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in && CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. 
+ // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = CONFIG_T::reuse_factor / CONFIG_T::n_in; + + int outidx[CONFIG_T::reuse_factor]; + IndexLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % CONFIG_T::n_in == 0) { + outstep++; + } + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + out_index = outidx[ir]/*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights_2d[im][ir])); + + out_index += outscale; + } + + in_index++; + if (in_index >= CONFIG_T::n_in) { + in_index = 0; + //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
+ } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = CONFIG_T::n_out; + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + + assert((multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + + // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices. + // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own. + typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights; + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int in_index = w_index % CONFIG_T::n_in; // As of Vitis HLS 2022.1, this still results in urem core being used. 
+ tmpmult[im] = CONFIG_T::template product::product(data[in_index], weights_2d[im][ir]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + CONFIG_T::reuse_factor * im; + int out_index = w_index / CONFIG_T::n_in; + if (out_index >= multiplier_limit) continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h new file mode 100644 index 000000000..955dc9e78 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,113 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +template +void dense_latency_wrapper( + data_T 
data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { + dense_resource(data, res, weights, biases); +} + +template +void data_prepare( + hls::stream &data_stream, + typename data_T::value_type data[CONFIG_T::n_in] +) { + #pragma HLS INLINE + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: for(int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + #pragma HLS PIPELINE + data_T data_pack = data_stream.read(); + DataPackPipeline: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write( + typename res_T::value_type res[CONFIG_T::n_out], + hls::stream &res_stream +) { + #pragma HLS INLINE + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: for(unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + #pragma HLS PIPELINE + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } + } else { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPackSingle: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + 
+template +void dense( + hls::stream &data_stream, + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) +{ + #pragma HLS INLINE recursive + + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + + data_prepare(data_stream, data); + if (CONFIG_T::strategy == nnet::latency) { + dense_latency_wrapper(data, res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, biases); + } + res_write(res, res_stream); +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h new file mode 100644 index 000000000..ac921e0d3 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -0,0 +1,340 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet{ + +// Return the maximum value from an array +template +T max(T x[N]){ + T y = x[0]; + for(int i = 1; i < N; i++){ + y = x[i] > y ? 
x[i] : y; + } + return y; +} + +template +ap_int avg(ap_int (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_int tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_int y = tmp; + return tmp; +} + +template +ap_fixed avg(ap_fixed (&x)[N]){ + // Use a wider accumulator than the input to avoid overflow + ap_fixed tmp = 0; + for(int i = 0; i < N; i++){ + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ap_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template +T avg(T (&x)[N]){ + T y = 0; + for(int i = 0; i < N; i++){ + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template +T pool_op(T (&x)[N]){ + switch(op){ + case Max: return max(x); + case Average: return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template +T pad_val(){ + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch(op){ + case Max:{ + T x = 0; + x[x.width - 1] = 1; + return x; + break;} + case Average: return 0; + } +} + +struct pooling1d_config{ + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for(int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for(int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if(ii+jj < CONFIG_T::pad_left || ii+jj >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[jj] = pad_val(); + }else{ + pool[jj] = data[(ii + jj) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to 
area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average) { + data_T rescale = CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl( + data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + for(int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config{ + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template +constexpr int pool_op_limit(){ + return 
DIV_ROUNDUP((CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt, CONFIG_T::reuse_factor); +} + +template +void pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width * CONFIG_T::n_filt + (jj + ll) * 
CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (jj/CONFIG_T::stride_width)* CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) +{ + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ + // Loop over input image y in steps of stride + for(int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height){ + // Loop over input image x in steps of stride + for(int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width){ + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + // 
Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for(int kk = 0; kk < CONFIG_T::stride_height; kk++){ + // Loop over pool window x + for(int ll = 0; ll < CONFIG_T::stride_width; ll++){ + if(ii+kk < CONFIG_T::pad_top || ii+kk >= (padded_height - CONFIG_T::pad_bottom) || jj+ll < CONFIG_T::pad_left || jj+ll >= (padded_width - CONFIG_T::pad_right)){ + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + }else{ + pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk) * CONFIG_T::in_width + ff * CONFIG_T::in_width*CONFIG_T::in_height + ll + jj]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if(CONFIG_T::pool_op == Average){ + data_T rescale = CONFIG_T::pool_height * CONFIG_T::pool_width / img_overlap; + res[(ii/CONFIG_T::stride_height) * CONFIG_T::out_width + (jj/CONFIG_T::stride_width) + ff* CONFIG_T::out_height* CONFIG_T::out_width] *= rescale; + } + } + } + } +} + + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=pool_op limit=limit + + FiltLoop: + for(int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * 
CONFIG_T::in_width]; + + InputLoop: + for (int i = 0 ; i < CONFIG_T::in_height * CONFIG_T::in_width ; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} + +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 000000000..f936c7c88 --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "utils/x_hls_utils.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_pooling.h" +#include "nnet_conv_stream.h" +#include "hls_stream.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template +T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template +void compute_pool_buffer_2d( + const data_T& in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt], + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + 
nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS INLINE recursive + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1,1)][CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* +template +void compute_pool_buffer_1d( + const data_T& in_elem, + hls::stream &res +) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ( (sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: for(unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: for(unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + #pragma HLS inline recursive + + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template +T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool( + const data_T& in_elem, + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt] +) { + PoolFilt: for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool(data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + 
res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } + +} + +template +void global_pooling1d_cl( + hls::stream &data, + hls::stream &res +) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + + PoolInitLoop: for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + + ReadInput: for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } + +} + +} + +#endif diff --git 
a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 000000000..6850497ff --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,89 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv1d_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } else { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } +} + +template +void pointwise_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } else { + ReadInputWidthSerial: for (unsigned i_iw 
= 0; i_iw < CONFIG_T::in_width; i_iw++) { + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + + +template +void separable_conv_1d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_buffer_cl(data, depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 000000000..352828ecd --- /dev/null +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,113 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "hls_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_conv2d_stream.h" + +namespace nnet { + +// Line Buffer Implementation (Phil's) +template +void 
depthwise_conv_2d_buffer_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + if (CONFIG_T::strategy == nnet::latency) { + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } + } +} + + +template +void pointwise_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + if (CONFIG_T::strategy == nnet::latency) { + 
ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } else { + ReadInputHeightSerial: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } + } +} + +template +void separable_conv_2d_cl( + hls::stream &data, + hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights[CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { + assert(CONFIG_T::depthwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + assert(CONFIG_T::pointwise_config::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS."); + + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_buffer_cl(data, 
depthwise_res, depthwise_weights, depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, pointwise_biases); +} + +} +#endif diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 3b0f9ad53..d34337c57 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -162,9 +162,11 @@ if {$opt(reset)} { open_solution "solution1" } catch {config_array_partition -maximum_size 4096} -config_compile -name_max_length 60 +config_compile -name_max_length 80 set_part $part +config_schedule -enable_dsp_full_reg=false create_clock -period $clock_period -name default +set_clock_uncertainty $clock_uncertainty default if {$opt(csim)} { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index cf97ce099..dcfcb00d4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -40,7 +40,7 @@ template void linear(hls::stream< data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LinearPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -62,7 +62,7 @@ template void relu(hls::stream void sigmoid(hls::stream data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -173,11 +173,12 @@ void softmax_latency(hls::stream &data, hls::stream &res) { invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } 
res.write(out_pack); @@ -253,11 +254,12 @@ void softmax_stable(hls::stream &data, hls::stream &res) { invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit out_pack[j] = exp_res[j] * inv_exp_sum; } res.write(out_pack); @@ -322,7 +324,8 @@ void softmax_legacy(hls::stream &data, hls::stream &res) { } res_T out_pack; - #pragma HLS DATA_PACK variable=out_pack + PRAGMA_DATA_PACK(out_pack) + SoftmaxInvPackLoop: for (unsigned j = 0; j < res_T::size; j++) { #pragma HLS UNROLL @@ -410,7 +413,7 @@ template void tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) HardSigmoidPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -495,7 +498,7 @@ void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hl data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) LeakyReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -521,7 +524,7 @@ void thresholded_relu(hls::stream &data, typename data_T::value_type the data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ThresholdedReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -560,7 +563,7 @@ template void softplus(hls::strea data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SoftplusPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -601,7 +604,7 @@ template void softsign(hls::strea data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) 
SoftsignPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -642,7 +645,7 @@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) EluPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -690,7 +693,7 @@ template void selu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -747,7 +750,7 @@ void binary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { @@ -772,7 +775,7 @@ void ternary_tanh(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) PReLUPackLoop: for (int j = 0; j < res_T::size; j++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index 200282784..2314f5609 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -70,18 +70,17 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Calcuate result Result: for (int ires = 0; ires < CONFIG_T::n_in; ires++) { if (CONFIG_T::n_filt==-1) { res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + bias[ires]; - } else { + } else { int norm_index = ires%CONFIG_T::n_filt; res[ires] = 
CONFIG_T::template product::product(data[ires], scale[norm_index]) + bias[norm_index]; } - } + } } // **************************************************** @@ -108,13 +107,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T data_T datareg; ap_uint<1> cache; for (int ii=0; ii threshold[norm_index] ) cache = 1; else cache = 0; res[ii] = (ap_uint<1>) cache; - } } @@ -134,7 +132,6 @@ void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T else cache = 0; res[ii] = (ap_int<2>) cache; - } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index ce76c01bc..ce49d65b0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -41,16 +41,15 @@ void normalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; - CONFIG_T::template product::limit(multiplier_limit); + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormpack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -83,7 +82,7 @@ void normalize_binary_tanh( data_T in_data = data.read(); nnet::array, CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -109,7 +108,7 @@ void normalize_ternary_tanh( data_T in_data = data.read(); nnet::array, 
CONFIG_T::n_in> out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) BatchNormPack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 9bfae8339..af59f9021 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -27,6 +27,17 @@ #define MIN(n,d) (n > d ? d : n) #define MAX(n,d) (n > d ? n : d) +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable= +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + namespace nnet { // Common type definitions diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 65f71d080..dd7225346 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -29,8 +29,8 @@ void conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=weights complete #pragma HLS ARRAY_PARTITION variable=biases complete - int multiplier_limit = CONFIG_T::n_pixels * (ceil(float(mult_n_in * mult_n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::mult_config::n_zeros) / float(CONFIG_T::reuse_factor))); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index 40568921e..b23c330c7 100644 --- 
a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -36,7 +36,7 @@ void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; @@ -74,7 +74,7 @@ template void conv_1d_cl(hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: conv_1d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index ff2fb181c..43222696c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -7,166 +7,6 @@ namespace nnet { -//Computes multiplier limit -//This function should not be synthesized into firmware -template - int compute_multiplier_limit_conv2d( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] -) -{ - int n_mult = 0; - - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || 
(oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - //padded - do nothing - continue; - } else { - if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { - n_mult++; - } - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - return ceil( float(n_mult) / float(CONFIG_T::reuse_factor) ); - -}//end compute_n_mult - -template -void conv_2d_latency_cf( - data_T data[CONFIG_T::in_height*CONFIG_T::in_width*CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height*CONFIG_T::out_width*CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ - - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - const int multiplier_limit = compute_multiplier_limit_conv2d(weights); - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - - // Convolve, saving all multiplication results to accumulate later - ConvOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - ConvOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; 
ow++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++){ - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - ConvFiltHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - ConvFiltWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - - int index_weight = fh*CONFIG_T::filt_width*CONFIG_T::n_chan*CONFIG_T::n_filt - + fw*CONFIG_T::n_chan*CONFIG_T::n_filt - + cc*CONFIG_T::n_filt - + ff; - - if ((oh*CONFIG_T::stride_height+fh) < CONFIG_T::pad_top - || (oh*CONFIG_T::stride_height+fh) >= (CONFIG_T::pad_top+CONFIG_T::in_height) - || (ow*CONFIG_T::stride_width+fw) < CONFIG_T::pad_left - || (ow*CONFIG_T::stride_width+fw) >= (CONFIG_T::pad_left+CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - int index_data = cc*CONFIG_T::in_height*CONFIG_T::in_width - + (oh*CONFIG_T::stride_height+fh-CONFIG_T::pad_top)*CONFIG_T::in_width - + (ow*CONFIG_T::stride_width+fw-CONFIG_T::pad_left); - mult[index_mult] = data[index_data] * weights[index_weight]; - } - - }//end mult loop - }//end channel loop - }//end filter width loop - }//end filter height loop - }//end output width loop - }//end output height loop - - - // Initialize accumulator with input biases - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - acc[oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff]=biases[ff]; - } - } - } - - - // Accumulate multiplication result - AccumOutHeight: for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - AccumOutWidth: for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - AccumFilt: for(int ff 
= 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++){ - AccumDotHeight: for(int fh = 0; fh < CONFIG_T::filt_height; fh++){ - AccumDotWidth: for(int fw = 0; fw < CONFIG_T::filt_width; fw++){ - - int index_mult = oh*CONFIG_T::out_width*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ow*CONFIG_T::n_filt*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + ff*CONFIG_T::n_chan*CONFIG_T::filt_height*CONFIG_T::filt_width - + cc*CONFIG_T::filt_height*CONFIG_T::filt_width - + fh*CONFIG_T::filt_width - + fw; - int index_acc = oh*CONFIG_T::out_width*CONFIG_T::n_filt - + ow*CONFIG_T::n_filt - + ff; - - acc[index_acc] += mult[index_mult]; - - }//end dot product filter width loop - }//end dot product filter height loop - }//end n channel loop - }//end n filter loop - }//end output width loop - }//end output height loop - - // Cast to "res_t" type - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - for(int oh = 0; oh < CONFIG_T::out_height; oh++) { - for(int ow = 0; ow < CONFIG_T::out_width; ow++) { - int res_index = ff*CONFIG_T::out_height*CONFIG_T::out_width + oh*CONFIG_T::out_width + ow; - int acc_index = oh*CONFIG_T::out_width*CONFIG_T::n_filt + ow*CONFIG_T::n_filt + ff; - res[res_index] = acc[acc_index]; - } - } - } - -}//end conv2d - template void conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], @@ -189,8 +29,8 @@ void conv_2d_latency_cl( #pragma HLS ARRAY_PARTITION variable=weights complete #pragma HLS ARRAY_PARTITION variable=biases complete - int multiplier_limit = CONFIG_T::n_pixels * (ceil(float(mult_n_in * mult_n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::mult_config::n_zeros) / float(CONFIG_T::reuse_factor))); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation 
instances=mul limit=CONFIG_T::mult_config::multiplier_limit PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index ccc048d4f..8a4fb6be8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -42,7 +42,7 @@ void conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; @@ -97,7 +97,7 @@ void conv_2d_cl( hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline region + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: conv_2d_buffer_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index ef46a5210..203810f28 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -94,7 +94,7 @@ void mult_buffer(hls::stream data_window[CONFIG_T:: data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( data, res, weights, biases); @@ -280,7 +280,7 @@ void compute_output_buffer_2d( #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable = res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -289,7 +289,7 @@ void compute_output_buffer_2d( if 
((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( kernel_data, res_out, weights, biases); @@ -351,7 +351,7 @@ void compute_output_buffer_1d( #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable = res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel to buffer nnet::kernel_shift_1d(in_elem, kernel_data); @@ -360,7 +360,7 @@ void compute_output_buffer_1d( if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( kernel_data, res_out, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h index dc803ff2b..7202b3a10 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h @@ -51,10 +51,12 @@ void dense_compressed( #pragma HLS ARRAY_PARTITION variable=acc complete #pragma HLS ARRAY_PARTITION variable=biases complete #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - //if (CONFIG_T::store_weights_in_bram){ - //#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else #pragma HLS data_pack variable=weights struct_level - //} +#endif InitAccum: for(unsigned i = 0; i < CONFIG_T::n_out; i++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index c4dcea4ab..464e8b495 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -52,8 +52,7 @@ void dense_latency( #pragma HLS ARRAY_PARTITION variable=mult complete 
#pragma HLS ARRAY_PARTITION variable=acc complete - int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index c0e5d1759..180365327 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -270,7 +270,7 @@ void dense_resource( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { dense_resource_rf_leq_nin(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 52c96c52c..564bafac9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,7 +16,7 @@ void dense_wrapper( typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out] ) { - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor dense_latency(data, res, weights, biases); @@ -56,7 +56,7 @@ void dense( #pragma HLS PIPELINE } res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = res[i_out * res_T::size + i_pack]; diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h index 3ada00b24..fb8e2fb43 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_embed_stream.h @@ -19,7 +19,7 @@ void embedding( #pragma HLS PIPELINE II=CONFIG_T::reuse_factor res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) DenseEmbedding: for (int i = 0; i < CONFIG_T::n_out; i++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h index 42d2ce80e..89f91d6f0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_image_stream.h @@ -44,7 +44,7 @@ void resize_nearest( #pragma HLS UNROLL data_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ResizeChan: for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { #pragma HLS UNROLL diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index a35c264d2..19f2b421d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -140,8 +140,7 @@ void dot1d( { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - CONFIG_T::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; #pragma HLS ARRAY_PARTITION variable=mult complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index 6a1d81a4a..1ebbb9662 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -37,7 +37,7 @@ void 
add(hls::stream &data1, hls::stream &data2, hls::stream input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AddPack: for (int j = 0; j < res_T::size; j++) { @@ -60,7 +60,7 @@ void subtract(hls::stream &data1, hls::stream &data2, hls::s input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) SubtractPack: for (int j = 0; j < res_T::size; j++) { @@ -83,7 +83,7 @@ void multiply(hls::stream &data1, hls::stream &data2, hls::s input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MultiplyPack: for (int j = 0; j < res_T::size; j++) { @@ -106,7 +106,7 @@ void average(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) AveragePack: for (int j = 0; j < res_T::size; j++) { @@ -129,7 +129,7 @@ void maximum(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MaximumPack: for (int j = 0; j < res_T::size; j++) { @@ -152,7 +152,7 @@ void minimum(hls::stream &data1, hls::stream &data2, hls::st input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) MinimumPack: for (int j = 0; j < res_T::size; j++) { @@ -174,7 +174,7 @@ void concatenate3d_0(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -193,7 +193,7 @@ void 
concatenate3d_0(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -216,7 +216,7 @@ void concatenate3d_1(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -232,7 +232,7 @@ void concatenate3d_1(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -256,7 +256,7 @@ void concatenate3d_2(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -294,7 +294,7 @@ void concatenate2d_0(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -310,7 +310,7 @@ void concatenate2d_0(hls::stream &data1, hls::stream &data2, input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) { @@ -331,7 +331,7 @@ void concatenate2d_1(hls::stream &data1, hls::stream &data2, input1_T in_data1 = data1.read(); input2_T in_data2 = data2.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) { @@ -361,7 +361,7 @@ void concatenate2d(hls::stream &data1, hls::stream &data2, h template void 
concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { res_T out_data; -#pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) ConcatLoop1: for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { #pragma HLS PIPELINE diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index 586bc65ae..966959c70 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -16,10 +16,7 @@ namespace product{ * types of each. * --- */ -class Product{ - public: - static void limit(unsigned multiplier_limit) {} // Nothing to do here -}; +class Product{}; template class both_binary : public Product{ @@ -77,10 +74,6 @@ class mult : public Product{ #pragma HLS INLINE return a * w; } - static void limit(unsigned multiplier_limit){ - #pragma HLS INLINE - #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - } }; template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 303315b52..cd7d1a135 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -109,7 +109,7 @@ void pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { @@ -159,7 +159,7 @@ void global_pooling1d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit for(int ff = 0; ff < 
CONFIG_T::n_filt; ff++) { data_T pool[CONFIG_T::n_in]; @@ -211,7 +211,7 @@ void pooling2d_cl( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; @@ -266,7 +266,7 @@ void pooling2d_cf( // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h index 837cd416a..af06624fa 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling_stream.h @@ -130,7 +130,7 @@ void pooling2d_encoded_cl( assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -176,7 +176,7 @@ void compute_pool_buffer_2d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); @@ -245,7 +245,7 @@ void 
pooling2d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling2d_buffer_cl(data, res); @@ -344,7 +344,7 @@ void pooling1d_encoded_cl( assert(CONFIG_T::pool_width == CONFIG_T::stride_width); res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; @@ -385,7 +385,7 @@ void compute_pool_buffer_1d( #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) // Add pixel into line buffer, return pooling kernels // 1D case line buffer not necessary. Put directly into the kernel_data buffer @@ -441,7 +441,7 @@ void pooling1d_cl( hls::stream &data, hls::stream &res ) { - #pragma HLS inline region + #pragma HLS inline recursive switch(CONFIG_T::implementation){ case conv_implementation::linebuffer: pooling1d_buffer_cl(data, res); @@ -523,7 +523,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -535,7 +535,7 @@ void global_pooling2d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); @@ -577,7 +577,7 @@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) MaxPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack]; @@ -589,7 +589,7 
@@ void global_pooling1d_cl( #pragma HLS PIPELINE res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) AvgPoolPack: for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index e94286aa8..a2581a94c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -293,7 +293,7 @@ template nnet::lstm(reset_state,data_in,h_newstate, s_newstate, param,param_r,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -305,7 +305,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -565,7 +565,7 @@ template nnet::gru(reset_state,data_in,h_newstate,param,param_zr,param_b, param_br); if (CONFIG_T::n_sequence_out > 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack_sequences: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; @@ -577,7 +577,7 @@ template if (CONFIG_T::n_sequence_out == 1){ res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) ResPack: for (int i_pack = 0; i_pack < res_T::size; i_pack++) { #pragma HLS UNROLL res_pack[i_pack] = h_newstate[i_pack]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index b0f6ce9c6..71ccf1a01 
100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -26,7 +26,7 @@ void depthwise_conv_1d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index 69e272652..b2c80950a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -27,7 +27,7 @@ void depthwise_conv_2d_encoded_cl( #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete res_T res_pack; - #pragma HLS DATA_PACK variable=res_pack + PRAGMA_DATA_PACK(res_pack) unsigned outputs_ready = 0; ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index e8826e300..ce6528995 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -26,8 +26,7 @@ void depthwise_product( #pragma HLS ARRAY_PARTITION variable=mult complete - int multiplier_limit = ceil(float(CONFIG_T::kernel_size * CONFIG_T::n_chan) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); - CONFIG_T::mult_config::template product::limit(multiplier_limit); + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit // Do the matrix-multiply Product: for(int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { @@ -77,7 +76,7 @@ void depthwise_mult_buffer( data[id] = data_window[id].read(); } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) 
{ depthwise_product(data, res, weights, biases); } else { @@ -157,7 +156,7 @@ void pointwise_mult_buffer( data[id] = data_pack[id]; } - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); } else { @@ -204,7 +203,7 @@ void compute_depthwise_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { @@ -268,7 +267,7 @@ void compute_depthwise_output_buffer_2d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE region + #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { depthwise_product(kernel_data, res_out, weights, biases); } else { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h index 9ee6628fe..b4de14ffd 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h @@ -3,17 +3,18 @@ #define NNET_STREAM_H #include "hls_stream.h" +#include "nnet_common.h" namespace nnet { struct broadcast_config { - static const unsigned in_height = 1; - static const unsigned in_width = 1; - static const unsigned in_chan = 3; - static const unsigned out_height = 2; - static const unsigned out_width = 2; - static const unsigned out_chan = 3; + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; }; template @@ -24,8 +25,8 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre data_T in_data = data.read(); res_T out_data1; 
res_T out_data2; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -47,9 +48,9 @@ void clone_stream(hls::stream &data, hls::stream &res1, hls::stre res_T out_data1; res_T out_data2; res_T out_data3; - #pragma HLS DATA_PACK variable=out_data1 - #pragma HLS DATA_PACK variable=out_data2 - #pragma HLS DATA_PACK variable=out_data3 + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) ClonePack: for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -72,7 +73,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < data_T::size; j++) { #pragma HLS UNROLL @@ -90,7 +91,7 @@ void repack_stream(hls::stream &data, hls::stream &res) { data_T in_data = data.read(); res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < pack_diff; j++) { #pragma HLS PIPELINE @@ -136,7 +137,7 @@ void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) for (int j = 0; j < n_dupl; j++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL out_data[k] = in_data[k]; @@ -152,20 +153,20 @@ void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) BroadcastLoop: for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - res_T out_data; - #pragma HLS DATA_PACK variable=out_data - for (int k = 0; k < res_T::size; k++) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { #pragma HLS UNROLL - out_data[k] = in_data[0]; - } - res.write(out_data); 
+ out_data[k] = in_data[0]; + } + res.write(out_data); } } template void broadcast_stream(hls::stream &data, hls::stream &res) { if(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { - broadcast_stream_1x1xC(data, res); + broadcast_stream_1x1xC(data, res); } else if(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width) { broadcast_stream_HxWx1(data, res); @@ -180,19 +181,19 @@ void transpose_2d(hls::stream &data, hls::stream &res) { for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { #pragma HLS PIPELINE data_T in_data = data.read(); - for (int j = 0; j < data_T::size; j++) { - data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); } } for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { #pragma HLS PIPELINE res_T out_data; - #pragma HLS DATA_PACK variable=out_data + PRAGMA_DATA_PACK(out_data) for (int j = 0; j < res_T::size; j++) { - out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); } - res.write(out_data); + res.write(out_data); } } } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b25576f9a..8ac4f1f8e 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -3,8 +3,10 @@ from hls4ml.writer.writers import Writer, register_writer, get_writer from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.quartus_writer import QuartusWriter register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) +register_writer('Vitis', VitisWriter) 
register_writer('Quartus', QuartusWriter) diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py new file mode 100644 index 000000000..44b7d97c0 --- /dev/null +++ b/hls4ml/writer/vitis_writer.py @@ -0,0 +1,31 @@ +import os +import glob +from shutil import copy +from hls4ml.writer.vivado_writer import VivadoWriter + +class VitisWriter(VivadoWriter): + + def __init__(self): + super().__init__() + + def write_nnet_utils_overrides(self, model): + ################### + ## nnet_utils + ################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir,'../templates/vitis/nnet_utils/') + dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copy(srcpath + h, dstpath + h) + + def write_hls(self, model): + """ + Write the HLS project. Calls the steps from VivadoWriter, adapted for Vitis + """ + super(VitisWriter, self).write_hls(model) + self.write_nnet_utils_overrides(model) diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index 0ce72d1af..46c193fdb 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -391,6 +391,8 @@ def write_board_script(self, model): f.write(f'set part "{self.vivado_accelerator_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write(f'set bit_width_hls_output {in_bit}\n') diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 9f67df4cb..a7d269102 100644 --- 
a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -583,6 +583,8 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.close() # build_prj.tcl @@ -642,7 +644,7 @@ def write_nnet_utils(self, model): # custom source filedir = os.path.dirname(os.path.abspath(__file__)) - custom_source = get_backend('Vivado').get_custom_source() + custom_source = model.config.backend.get_custom_source() for dst, srcpath in custom_source.items(): dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' copyfile(srcpath, dstpath) diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 7aea0884e..9875bfe14 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -9,7 +9,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index 1b17637d9..f50329230 100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py 
index d183d81c4..4e128b874 100644 --- a/test/pytest/test_causalpadding.py +++ b/test/pytest/test_causalpadding.py @@ -10,7 +10,7 @@ atol = 5e-3 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_causalpadding(io_type, backend): model = Sequential() diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 262ae5013..ab3365f22 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -58,6 +58,10 @@ def keras_model(mnist_data): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_cnn_mnist_qkeras.py b/test/pytest/test_cnn_mnist_qkeras.py index c34e0965a..cf3dbf17d 100644 --- a/test/pytest/test_cnn_mnist_qkeras.py +++ b/test/pytest/test_cnn_mnist_qkeras.py @@ -40,7 +40,12 @@ def mnist_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def hls_model(mnist_model, backend, io_type, strategy): keras_model = mnist_model @@ -66,7 +71,12 @@ def hls_model(mnist_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + 
('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource') ]) def test_accuracy(mnist_data, mnist_model, hls_model): x_train, y_train, x_test, y_test = mnist_data diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 1d91d80ea..bc8a68002 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -30,7 +30,11 @@ def keras_model(): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def hls_model(keras_model, backend, io_type, strategy): default_precision = 'ap_fixed<16,3,AP_RND_CONV,AP_SAT>' if backend=='Vivado' else 'ac_fixed<16,3,true,AC_RND_CONV,AC_SAT>' @@ -63,7 +67,11 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vivado', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_parallel', 'resource'), + ('Vitis', 'io_parallel', 'latency'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index 8073a7a1a..fd8e39cdb 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def 
hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 1c8e07198..9945768ea 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -126,11 +126,14 @@ def regsister_custom_layer(): hls4ml.model.layers.register_layer('HReverse', HReverse) -@pytest.mark.parametrize('backend_id', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend_id', ['Vivado', 'Vitis', 'Quartus']) def test_extensions(tmp_path, backend_id): # Register the optimization passes (if any) backend = hls4ml.backends.get_backend(backend_id) - backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize') + ip_flow = hls4ml.model.flow.get_flow(backend.get_default_flow()) + # Add the pass into the main optimization flow + optimize_flow = [flow for flow in ip_flow.requires if ':optimize' in flow][0] + backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=optimize_flow) # Register template passes for the given backend backend.register_template(HReverseConfigTemplate) @@ -168,6 +171,6 @@ def test_extensions(tmp_path, backend_id): hres = hmodel.predict(x.astype('float32')) # Check if the optimizer pass was applied - assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][f'{backend_id.lower()}:optimize'] + assert f'{backend_id.lower()}:remove_duplicate_reverse' in hmodel._applied_flows[0][optimize_flow] np.testing.assert_array_equal(kres, hres) diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index e1d3b1fec..ebbdb2419 100644 --- a/test/pytest/test_globalpooling.py +++ 
b/test/pytest/test_globalpooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -88,7 +88,7 @@ def keras_model_2d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index bd3f175b1..6da516646 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -15,7 +15,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_dense(backend, io_type): model = tf.keras.models.Sequential() @@ -66,7 +66,7 @@ def test_dense(backend, io_type): PReLU(alpha_initializer="zeros",), Activation(activation='sigmoid', name='Activation')]) #ThresholdedReLU(theta=1.0)]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activations(activation_function, backend, io_type): model = tf.keras.models.Sequential() @@ -94,7 +94,7 @@ def test_activations(activation_function, backend, io_type): padds_options = ['same', 'valid'] @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv1d(padds, backend, io_type): model = tf.keras.models.Sequential() @@ -123,8 +123,8 @@ def test_conv1d(padds, 
backend, io_type): # 5e-2 might be too high np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2) - if not (backend=='Vivado' and io_type=='io_stream' and padds=='same'): - # Vivado inserts and additional layer for 'same' padding in io_stream + if not (backend in ['Vivado', 'Vitis'] and io_type=='io_stream' and padds=='same'): + # Vivado/Vitis inserts and additional layer for 'same' padding in io_stream assert len(model.layers) + 2 == len(hls_model.get_layers()) assert list(hls_model.get_layers())[1].attributes['name'] == model.layers[0]._name assert list(hls_model.get_layers())[1].attributes['class_name'] == 'Conv1D' @@ -154,7 +154,7 @@ def test_conv1d(padds, backend, io_type): padds_options=['same', 'valid'] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv2d(chans, padds, backend, io_type): model = tf.keras.models.Sequential() @@ -235,7 +235,7 @@ def test_conv2d(chans, padds, backend, io_type): @pytest.mark.parametrize('pooling', pooling_layers) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('chans', chans_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_pooling(pooling, padds, chans, backend): assert '1D' in pooling.__name__ or '2D' in pooling.__name__ diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py index 0fa689e45..08753d584 100644 --- a/test/pytest/test_keras_h5_loader.py +++ b/test/pytest/test_keras_h5_loader.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def 
test_keras_h5_loader(backend): input_shape = (10,) model = tf.keras.models.Sequential([ diff --git a/test/pytest/test_merge.py b/test/pytest/test_merge.py index 470e9b3ff..8ab4fa3a1 100644 --- a/test/pytest/test_merge.py +++ b/test/pytest/test_merge.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize('merge_layer', [Add, Average, Maximum, Minimum, Multiply, Subtract]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_merge(merge_layer, io_type, backend): input_shape = (10, 10, 3) @@ -35,7 +35,7 @@ def test_merge(merge_layer, io_type, backend): @pytest.mark.parametrize('axes', [1]) @pytest.mark.parametrize('io_type', ['io_parallel']) # No io_stream implementation yet -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_dot(axes, io_type, backend): # Only 1D implemented input_shape = (10, ) @@ -61,7 +61,7 @@ def test_dot(axes, io_type, backend): np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate1d(io_type, backend): input_shape = (10,) @@ -87,7 +87,7 @@ def test_concatenate1d(io_type, backend): @pytest.mark.parametrize('axis', [1, 2]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate2d(axis, io_type, backend): input_shape = (10, 3) @@ -114,7 +114,7 @@ def test_concatenate2d(axis, io_type, backend): @pytest.mark.parametrize('axis', [1, 2, 3]) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) 
-@pytest.mark.parametrize('backend', ['Quartus', 'Vivado']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_concatenate3d(axis, io_type, backend): input_shape = (10, 10, 3) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 7650056f8..d43e35288 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -20,11 +20,13 @@ @pytest.mark.parametrize('backend, io_type, strategy', [ ('Quartus', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'resource'), - + ('Vitis', 'io_parallel', 'resource'), ('Vivado', 'io_parallel', 'latency'), - + ('Vitis', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource') + ('Vivado', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ]) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): model = tf.keras.models.Sequential() diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 399154144..8645ecd0b 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -123,7 +123,7 @@ def randX_100_16(): # https://github.com/fastmachinelearning/hls4ml/issues/381 # @pytest.mark.parametrize('bits', [4, 6, 8]) @pytest.mark.parametrize('bits,alpha', [(4, 1), (4, 'auto_po2')]) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_type): ''' @@ -221,7 +221,7 @@ def test_quantizer_special(randX_1000_1, quantizer, backend, io_type): (7, 10, binary(), quantized_bits(5, 2), binary(), False, True), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 
'io_stream']) def test_btnn(make_btnn, randX_100_10, backend, io_type): model, is_xnor, test_no = make_btnn @@ -264,7 +264,7 @@ def randX_1000_1(): (quantized_relu(10, 5)), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_quantizer(randX_1000_1, quantizer, backend, io_type): ''' diff --git a/test/pytest/test_rnn.py b/test/pytest/test_rnn.py index 12fc42601..aa49e43d3 100644 --- a/test/pytest/test_rnn.py +++ b/test/pytest/test_rnn.py @@ -70,10 +70,14 @@ def test_rnn_parsing(rnn_layer, return_sequences): [ (SimpleRNN, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_parallel'), + (LSTM, 'Vitis', 'io_parallel'), (LSTM, 'Quartus', 'io_parallel'), (LSTM, 'Vivado', 'io_stream'), + (LSTM, 'Vitis', 'io_stream'), (GRU, 'Vivado', 'io_parallel'), (GRU, 'Vivado', 'io_stream'), + (GRU, 'Vitis', 'io_parallel'), + (GRU, 'Vitis', 'io_stream'), (GRU, 'Quartus', 'io_parallel'), (GRU, 'Quartus', 'io_stream'), ], diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 7815d5770..d32569449 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -23,7 +23,8 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) model.add(conv2d(filters=32, @@ -42,8 +43,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') 
kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') - output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds)) - hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type) + output_dir = str(test_root_path / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format(conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type)) + hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend) hls_model.compile() hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 749a019f3..9290faf50 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -23,7 +23,7 @@ def high_accuracy_distribution(shape): def generate_data(function, input_shape): return function((1000, *input_shape)) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('strategy', ['stable', 'argmax']) @pytest.mark.parametrize('function,input_shape,io_type', [ (flat_distribution, (8,), 'io_parallel'), @@ -58,7 +58,7 @@ def test_softmax(backend, strategy, generate_data, input_shape, io_type, functio assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 2f70b8251..338aaf6f3 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -7,7 +7,7 @@ test_root_path = 
Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('input_shape, io_type', [ ((8, ), 'io_parallel'), ((8, ), 'io_stream'), diff --git a/test/pytest/test_trace.py b/test/pytest/test_trace.py index ce01c4213..4c7cde4ac 100644 --- a/test/pytest/test_trace.py +++ b/test/pytest/test_trace.py @@ -8,7 +8,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_trace(backend): '''Test the tracing feature with a simple Keras model.''' model = tf.keras.models.Sequential() diff --git a/test/pytest/test_transpose_concat.py b/test/pytest/test_transpose_concat.py index 488fc46b6..db3e03125 100644 --- a/test/pytest/test_transpose_concat.py +++ b/test/pytest/test_transpose_concat.py @@ -29,7 +29,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name' @@ -45,7 +45,7 @@ def hls_model(keras_model, backend, io_type): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_accuracy(data, keras_model, hls_model): X = data model = keras_model diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 7e698fd90..0f5130162 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -41,7 +41,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) 
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 219f727c0..ca539a9ef 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -45,7 +45,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d':