Skip to content

Commit

Permalink
Merge pull request fastmachinelearning#815 from vloncar/vitis_conv_fix
Browse files Browse the repository at this point in the history
Fix Vitis Conv1D/2D latency strategy
  • Loading branch information
jmitrevs authored Jun 19, 2023
2 parents 6075981 + ff7f3f3 commit 163391e
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 4 deletions.
68 changes: 68 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#ifndef NNET_CONV1D_H_
#define NNET_CONV1D_H_

#include "nnet_common.h"
#include "nnet_conv1d_latency.h"
#include "nnet_conv1d_resource.h"
#include <cstdlib>

namespace nnet {

// Compile-time configuration for 1D convolution layers.
// NOTE(review): these are placeholder defaults — the hls4ml code generator
// presumably emits a derived per-layer config overriding them (the n_chan = 0
// default is not usable as-is); confirm against the generated project.
struct conv1d_config {
    // Internal data type definitions
    typedef float bias_t;
    typedef float weight_t;
    typedef float accum_t;

    // Convolutional parameters
    static const unsigned pad_left = 0;
    static const unsigned pad_right = 0;
    static const unsigned in_width = 10;
    static const unsigned n_chan = 0;
    static const unsigned filt_width = 1;
    static const unsigned kernel_size = filt_width;
    static const unsigned n_filt = 1;
    static const unsigned stride_width = 1;
    static const unsigned dilation = 1;
    static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1

    // Resource-usage knobs
    static const unsigned reuse_factor = 1;          // multiplier reuse count (II of the pipelined loop)
    static const bool store_weights_in_bram = false; // keep weights in BRAM instead of registers
    static const unsigned n_zeros = 0; // not used yet
};

// io_parallel 1D convolution entry point.
// Selects between the latency-optimized and resource-optimized implementations
// according to the strategy compiled into CONFIG_T.
template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
    //#pragma HLS INLINE recursive

    const bool use_latency_impl = (CONFIG_T::strategy == nnet::latency);
    if (use_latency_impl) {
        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        return;
    }
    conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}

// Pointwise (filter width 1) 1D convolution for io_parallel.
// Valid only for 1-wide kernels; otherwise identical dispatch to conv_1d_cl.
template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    assert(CONFIG_T::filt_width == 1);

    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
    //#pragma HLS INLINE recursive

    // Nothing special to be done for io_parallel implementation
    if (CONFIG_T::strategy != nnet::latency) {
        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
    } else {
        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
    }
}

} // namespace nnet

#endif
89 changes: 89 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#ifndef NNET_CONV1D_LATENCY_H_
#define NNET_CONV1D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

// Latency-strategy 1D convolution for io_parallel.
// Processes the input in CONFIG_T::n_partitions chunks; each partition fills a
// buffer holding the receptive field of CONFIG_T::n_pixels output pixels, then
// computes every pixel with a fully unrolled matrix-vector multiply.
// NOTE(review): n_pixels, n_partitions, fill_buffer and mult_config are not in
// the conv1d_config defaults above — presumably supplied by the generated
// per-layer config; confirm against the hls4ml code generator.
template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                        typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    // One output pixel consumes filt_width * n_chan inputs and yields n_filt outputs.
    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
    constexpr unsigned mult_n_out = CONFIG_T::n_filt;

    // Receptive-field buffer for the pixels handled in one partition.
    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

    // Per-pixel products and accumulators, fully partitioned into registers.
    typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
    #pragma HLS ARRAY_PARTITION variable=mult complete

    typename CONFIG_T::accum_t acc[mult_n_out];
    #pragma HLS ARRAY_PARTITION variable=acc complete

    #pragma HLS ARRAY_PARTITION variable=weights complete
    #pragma HLS ARRAY_PARTITION variable=biases complete

    // Limit multipliers to control parallelization
    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
        // Pipeline across partitions; the initiation interval is the reuse factor.
        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

        // Gather the input windows for every pixel of this partition.
        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

    PixelLoop:
        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
            #pragma HLS UNROLL

            data_T cache;

            // Do the matrix-multiply
        Product1:
            for (int i_in = 0; i_in < mult_n_in; i_in++) {
                #pragma HLS UNROLL
                cache = data_buf[i_pxl][i_in];
            Product2:
                for (int i_out = 0; i_out < mult_n_out; i_out++) {
                    #pragma HLS UNROLL
                    // Weights are laid out input-major: element [i_in][i_out].
                    mult[i_in * mult_n_out + i_out] =
                        CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
                            cache, weights[i_in * mult_n_out + i_out]);
                }
            }

            // Initialize accumulator with input biases
        ResetAccum:
            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
                #pragma HLS UNROLL
                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
            }

            // Accumulate multiplication result
        Accum1:
            for (int i_in = 0; i_in < mult_n_in; i_in++) {
                #pragma HLS UNROLL
            Accum2:
                for (int i_out = 0; i_out < mult_n_out; i_out++) {
                    #pragma HLS UNROLL
                    acc[i_out] += mult[i_in * mult_n_out + i_out];
                }
            }

            // Cast to "res_t" type
        Result:
            for (int i_res = 0; i_res < mult_n_out; i_res++) {
                #pragma HLS UNROLL
                // Outputs are written partition-major, then pixel, then filter.
                res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
                    cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
            }
        }
    }
}

} // namespace nnet
#endif
77 changes: 77 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef NNET_CONV2D_H_
#define NNET_CONV2D_H_

#include "nnet_common.h"
#include "nnet_conv2d_latency.h"
#include "nnet_conv2d_resource.h"
#include <cstdlib>

namespace nnet {

// Compile-time configuration for 2D convolution layers.
// NOTE(review): placeholder defaults — the hls4ml code generator presumably
// emits a derived per-layer config that overrides these; confirm against the
// generated project before relying on any value here.
struct conv2d_config {
    // Internal data type definitions
    typedef float bias_t;
    typedef float weight_t;
    typedef float accum_t;

    // Convolutional parameters
    static const unsigned pad_top = 0;
    static const unsigned pad_bottom = 0;
    static const unsigned pad_left = 0;
    static const unsigned pad_right = 0;
    static const unsigned in_height = 10;
    static const unsigned in_width = 10;
    static const unsigned n_chan = 1;
    static const unsigned filt_height = 1;
    static const unsigned filt_width = 1;
    static const unsigned kernel_size = filt_height * filt_width;
    static const unsigned n_filt = 1;
    static const unsigned stride_height = 1;
    static const unsigned stride_width = 1;
    static const unsigned out_height = 10;
    static const unsigned out_width = 10;
    static const unsigned dilation_height = 1;
    static const unsigned dilation_width = 1;

    // Resource-usage knobs
    static const unsigned reuse_factor = 1;          // multiplier reuse count (II of the pipelined loop)
    static const bool store_weights_in_bram = false; // keep weights in BRAM instead of registers
    static const unsigned n_zeros = 0; // not used yet
};

// io_parallel 2D convolution entry point.
// Selects between the latency-optimized and resource-optimized implementations
// according to the strategy compiled into CONFIG_T.
template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_cl(
    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
    //#pragma HLS INLINE recursive

    const bool use_latency_impl = (CONFIG_T::strategy == nnet::latency);
    if (use_latency_impl) {
        conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        return;
    }
    conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}

// Pointwise (1x1 kernel) 2D convolution for io_parallel.
// Dispatches to the latency- or resource-optimized implementation.
template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
                          res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    // A pointwise 2D convolution requires a 1x1 kernel in BOTH dimensions. The
    // original code validated only filt_width, so a kernel with filt_height != 1
    // would pass the check even though the weights array here is sized for
    // n_chan * n_filt elements only, while the latency implementation reads
    // filt_height * filt_width * n_chan * n_filt weights — an out-of-bounds read.
    assert(CONFIG_T::filt_width == 1);
    assert(CONFIG_T::filt_height == 1);

    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
    //#pragma HLS INLINE recursive

    // Nothing special to be done for io_parallel implementation
    if (CONFIG_T::strategy == nnet::latency) {
        conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
    } else {
        conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
    }
}

} // namespace nnet

#endif
90 changes: 90 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d_latency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#ifndef NNET_CONV2D_LATENCY_H_
#define NNET_CONV2D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

// Latency-strategy 2D convolution for io_parallel.
// Processes the input in CONFIG_T::n_partitions chunks; each partition fills a
// buffer holding the receptive field of CONFIG_T::n_pixels output pixels, then
// computes every pixel with a fully unrolled matrix-vector multiply.
// NOTE(review): n_pixels, n_partitions, fill_buffer and mult_config are not in
// the conv2d_config defaults above — presumably supplied by the generated
// per-layer config; confirm against the hls4ml code generator.
template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_latency_cl(
    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    // One output pixel consumes filt_height * filt_width * n_chan inputs and
    // yields n_filt outputs.
    constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan;
    constexpr unsigned mult_n_out = CONFIG_T::n_filt;

    // Receptive-field buffer for the pixels handled in one partition.
    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

    // Per-pixel products and accumulators, fully partitioned into registers.
    typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
    #pragma HLS ARRAY_PARTITION variable=mult complete

    typename CONFIG_T::accum_t acc[mult_n_out];
    #pragma HLS ARRAY_PARTITION variable=acc complete

    #pragma HLS ARRAY_PARTITION variable=weights complete
    #pragma HLS ARRAY_PARTITION variable=biases complete

    // Limit multipliers to control parallelization
    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
        // Pipeline across partitions; the initiation interval is the reuse factor.
        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

        // Gather the input windows for every pixel of this partition.
        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

    PixelLoop:
        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
            #pragma HLS UNROLL

            data_T cache;

            // Do the matrix-multiply
        Product1:
            for (int i_in = 0; i_in < mult_n_in; i_in++) {
                #pragma HLS UNROLL
                cache = data_buf[i_pxl][i_in];
            Product2:
                for (int i_out = 0; i_out < mult_n_out; i_out++) {
                    #pragma HLS UNROLL
                    // Weights are laid out input-major: element [i_in][i_out].
                    mult[i_in * mult_n_out + i_out] =
                        CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
                            cache, weights[i_in * mult_n_out + i_out]);
                }
            }

            // Initialize accumulator with input biases
        ResetAccum:
            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
                #pragma HLS UNROLL
                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
            }

            // Accumulate multiplication result
        Accum1:
            for (int i_in = 0; i_in < mult_n_in; i_in++) {
                #pragma HLS UNROLL
            Accum2:
                for (int i_out = 0; i_out < mult_n_out; i_out++) {
                    #pragma HLS UNROLL
                    acc[i_out] += mult[i_in * mult_n_out + i_out];
                }
            }

            // Cast to "res_t" type
        Result:
            for (int i_res = 0; i_res < mult_n_out; i_res++) {
                #pragma HLS UNROLL
                // Outputs are written partition-major, then pixel, then filter.
                res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
                    cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
            }
        }
    }
}

} // namespace nnet
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ template <class data_T, class res_T, typename CONFIG_T> void hard_tanh(hls::stre

data_T in_data = data.read();
res_T out_data;
#pragma HLS DATA_PACK variable=out_data
PRAGMA_DATA_PACK(out_data)

HardSigmoidPackLoop:
for (int j = 0; j < res_T::size; j++) {
Expand Down
6 changes: 3 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ void pointwise_mult_buffer(const data_T &data_pack, hls::stream<res_T> &res_stre
#pragma HLS ARRAY_PARTITION variable=res complete

res_T res_pack;
#pragma HLS DATA_PACK variable=res_pack
PRAGMA_DATA_PACK(out_data)

InitData:
for (int id = 0; id < CONFIG_T::n_chan; id++) {
Expand Down Expand Up @@ -192,7 +192,7 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream<res_T
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
#pragma HLS DATA_PACK variable=res_pack
PRAGMA_DATA_PACK(out_data)

// Add pixel to buffer
nnet::kernel_shift_1d<data_T, CONFIG_T>(in_elem, kernel_data);
Expand Down Expand Up @@ -257,7 +257,7 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem,
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
#pragma HLS DATA_PACK variable=res_pack
PRAGMA_DATA_PACK(out_data)

// Add pixel to buffer
nnet::shift_line_buffer<data_T, CONFIG_T>(in_elem, line_buffer, kernel_data);
Expand Down

0 comments on commit 163391e

Please sign in to comment.