Fix Vitis Conv1D/2D latency strategy #815

Merged · 2 commits · Jun 19, 2023
Changes from all commits
68 changes: 68 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,68 @@
#ifndef NNET_CONV1D_H_
#define NNET_CONV1D_H_

#include "nnet_common.h"
#include "nnet_conv1d_latency.h"
#include "nnet_conv1d_resource.h"
#include <cstdlib>

namespace nnet {

struct conv1d_config {
// Internal data type definitions
typedef float bias_t;
typedef float weight_t;
typedef float accum_t;

// Convolutional parameters
static const unsigned pad_left = 0;
static const unsigned pad_right = 0;
static const unsigned in_width = 10;
static const unsigned n_chan = 0;
static const unsigned filt_width = 1;
static const unsigned kernel_size = filt_width;
static const unsigned n_filt = 1;
static const unsigned stride_width = 1;
static const unsigned dilation = 1;
static const unsigned out_width = 10; // (N_IN + PAD_LEFT + PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1

static const unsigned reuse_factor = 1;
static const bool store_weights_in_bram = false;
static const unsigned n_zeros = 0; // not used yet
};

template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

if (CONFIG_T::strategy == nnet::latency) {
conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

// Nothing special to be done for io_parallel implementation
if (CONFIG_T::strategy == nnet::latency) {
conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

} // namespace nnet

#endif
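For orientation, conv_1d_cl is instantiated with a generated CONFIG_T that derives from conv1d_config above. A minimal sketch of such a configuration follows; the struct name and sizes are illustrative, not taken from this PR, and the latency path additionally expects the mult_config, n_partitions/n_pixels, and fill_buffer members used in nnet_conv1d_latency.h below.

// Hypothetical stand-in for an hls4ml-generated per-layer config.
struct example_conv1d_config : nnet::conv1d_config {
    static const unsigned in_width = 10;
    static const unsigned n_chan = 3;
    static const unsigned filt_width = 3;
    static const unsigned n_filt = 4;
    static const unsigned out_width = 8; // (10 + 0 + 0 - (1*(3-1)+1)) / 1 + 1
    static const unsigned strategy = nnet::latency; // selects conv_1d_latency_cl in the dispatch above
};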
89 changes: 89 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -0,0 +1,89 @@
#ifndef NNET_CONV1D_LATENCY_H_
#define NNET_CONV1D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
constexpr unsigned mult_n_out = CONFIG_T::n_filt;

data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
#pragma HLS ARRAY_PARTITION variable=mult complete

typename CONFIG_T::accum_t acc[mult_n_out];
#pragma HLS ARRAY_PARTITION variable=acc complete

#pragma HLS ARRAY_PARTITION variable=weights complete
#pragma HLS ARRAY_PARTITION variable=biases complete

// Limit multipliers to control parallelization
#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

PixelLoop:
for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
#pragma HLS UNROLL

data_T cache;

// Do the matrix-multiply
Product1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
cache = data_buf[i_pxl][i_in];
Product2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
mult[i_in * mult_n_out + i_out] =
CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
cache, weights[i_in * mult_n_out + i_out]);
}
}

// Initialize accumulator with input biases
ResetAccum:
for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
#pragma HLS UNROLL
acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
}

// Accumulate multiplication result
Accum1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
Accum2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
acc[i_out] += mult[i_in * mult_n_out + i_out];
}
}

// Cast to "res_T" type
Result:
for (int i_res = 0; i_res < mult_n_out; i_res++) {
#pragma HLS UNROLL
res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
}
}
}
}

} // namespace nnet
#endif
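The fill_buffer call in PartitionLoop is also supplied by the generated CONFIG_T: for partition i_part it gathers, im2col-style, the mult_n_in receptive-field values of each of the n_pixels output pixels handled in that pipeline iteration. A hypothetical implementation for the unpadded, unit-dilation 1D case is sketched below; hls4ml generates these per layer, so this only illustrates the expected interface and indexing.

// Hypothetical functor matching the shape of data_buf above.
template <class data_T, typename CONFIG_T> struct example_fill_buffer {
    // Copy the receptive field of every output pixel in this partition
    // into the fully partitioned buffer consumed by PixelLoop.
    static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                            data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
                            const unsigned partition) {
        for (unsigned p = 0; p < CONFIG_T::n_pixels; p++) {
            const unsigned pixel = partition * CONFIG_T::n_pixels + p; // global output pixel index
            for (unsigned w = 0; w < CONFIG_T::filt_width; w++) {
                for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
                    buffer[p][w * CONFIG_T::n_chan + c] =
                        data[(pixel * CONFIG_T::stride_width + w) * CONFIG_T::n_chan + c];
                }
            }
        }
    }
};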
77 changes: 77 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
@@ -0,0 +1,77 @@
#ifndef NNET_CONV2D_H_
#define NNET_CONV2D_H_

#include "nnet_common.h"
#include "nnet_conv2d_latency.h"
#include "nnet_conv2d_resource.h"
#include <cstdlib>

namespace nnet {

struct conv2d_config {
// Internal data type definitions
typedef float bias_t;
typedef float weight_t;
typedef float accum_t;

// Convolutional parameters
static const unsigned pad_top = 0;
static const unsigned pad_bottom = 0;
static const unsigned pad_left = 0;
static const unsigned pad_right = 0;
static const unsigned in_height = 10;
static const unsigned in_width = 10;
static const unsigned n_chan = 1;
static const unsigned filt_height = 1;
static const unsigned filt_width = 1;
static const unsigned kernel_size = filt_height * filt_width;
static const unsigned n_filt = 1;
static const unsigned stride_height = 1;
static const unsigned stride_width = 1;
static const unsigned out_height = 10;
static const unsigned out_width = 10;
static const unsigned dilation_height = 1;
static const unsigned dilation_width = 1;

static const unsigned reuse_factor = 1;
static const bool store_weights_in_bram = false;
static const unsigned n_zeros = 0; // not used yet
};

template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_cl(
data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

if (CONFIG_T::strategy == nnet::latency) {
conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

// Nothing special to be done for io_parallel implementation
if (CONFIG_T::strategy == nnet::latency) {
conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

} // namespace nnet

#endif
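The out_height/out_width defaults above follow the same shape rule as the comment in nnet_conv1d.h. As a self-contained worked check (not part of the PR):

// Illustrative shape helper: out = (in + pad_lo + pad_hi - (dilation*(filt-1)+1)) / stride + 1
constexpr unsigned conv_out_dim(unsigned in, unsigned pad_lo, unsigned pad_hi,
                                unsigned filt, unsigned stride, unsigned dilation) {
    return (in + pad_lo + pad_hi - (dilation * (filt - 1) + 1)) / stride + 1;
}
static_assert(conv_out_dim(10, 1, 1, 3, 1, 1) == 10, "'same' 3x3 conv keeps a 10-wide dimension");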
90 changes: 90 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d_latency.h
@@ -0,0 +1,90 @@
#ifndef NNET_CONV2D_LATENCY_H_
#define NNET_CONV2D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_latency_cl(
data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan;
constexpr unsigned mult_n_out = CONFIG_T::n_filt;

data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
#pragma HLS ARRAY_PARTITION variable=mult complete

typename CONFIG_T::accum_t acc[mult_n_out];
#pragma HLS ARRAY_PARTITION variable=acc complete

#pragma HLS ARRAY_PARTITION variable=weights complete
#pragma HLS ARRAY_PARTITION variable=biases complete

// Limit multipliers to control parallelization
#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

PixelLoop:
for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
#pragma HLS UNROLL

data_T cache;

// Do the matrix-multiply
Product1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
cache = data_buf[i_pxl][i_in];
Product2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
mult[i_in * mult_n_out + i_out] =
CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
cache, weights[i_in * mult_n_out + i_out]);
}
}

// Initialize accumulator with input biases
ResetAccum:
for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
#pragma HLS UNROLL
acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
}

// Accumulate multiplication result
Accum1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
Accum2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
acc[i_out] += mult[i_in * mult_n_out + i_out];
}
}

// Cast to "res_T" type
Result:
for (int i_res = 0; i_res < mult_n_out; i_res++) {
#pragma HLS UNROLL
res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
}
}
}
}

} // namespace nnet
#endif
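The ALLOCATION pragma in both latency kernels caps mul instances at mult_config::multiplier_limit; this is how reuse_factor trades throughput for resources, since each physical multiplier is then shared across several of the mult_n_in * mult_n_out products per pipeline iteration, matching the II=reuse_factor pipeline. A sketch of the expected relationship is below; hls4ml's backend computes the actual value and may also discount zero weights.

// Illustrative: ceiling division of total products by the reuse factor.
constexpr unsigned multiplier_limit(unsigned n_in, unsigned n_out, unsigned reuse) {
    return (n_in * n_out + reuse - 1) / reuse;
}
static_assert(multiplier_limit(27, 4, 4) == 27, "108 products at reuse 4 need 27 multipliers");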
hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h
@@ -450,7 +450,7 @@ template <class data_T, class res_T, typename CONFIG_T> void hard_tanh(hls::stre

data_T in_data = data.read();
res_T out_data;
-#pragma HLS DATA_PACK variable=out_data
+PRAGMA_DATA_PACK(out_data)

HardSigmoidPackLoop:
for (int j = 0; j < res_T::size; j++) {
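The remaining hunks replace the raw DATA_PACK pragma with the PRAGMA_DATA_PACK macro: Vitis HLS dropped DATA_PACK (struct stream payloads are aggregated by default there), while Vivado HLS still requires the pragma. A plausible definition of the macro, sketched here as an assumption since the exact spelling lives in hls4ml's shared headers:

// Assumed sketch: emit the legacy pragma only under Vivado HLS.
#ifdef __VITIS_HLS__
#define PRAGMA_DATA_PACK(var)
#else
#define DO_PRAGMA(x) _Pragma(#x)
#define PRAGMA_DATA_PACK(var) DO_PRAGMA(HLS data_pack variable = var)
#endif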
6 changes: 3 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h
@@ -145,7 +145,7 @@ void pointwise_mult_buffer(const data_T &data_pack, hls::stream<res_T> &res_stre
#pragma HLS ARRAY_PARTITION variable=res complete

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

InitData:
for (int id = 0; id < CONFIG_T::n_chan; id++) {
@@ -192,7 +192,7 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream<res_T
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

// Add pixel to buffer
nnet::kernel_shift_1d<data_T, CONFIG_T>(in_elem, kernel_data);
@@ -257,7 +257,7 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem,
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

// Add pixel to buffer
nnet::shift_line_buffer<data_T, CONFIG_T>(in_elem, line_buffer, kernel_data);