Fix Vitis Conv1D/2D latency strategy #815

Merged · 2 commits · Jun 19, 2023
Changes from all commits
68 changes: 68 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,68 @@
#ifndef NNET_CONV1D_H_
#define NNET_CONV1D_H_

#include "nnet_common.h"
#include "nnet_conv1d_latency.h"
#include "nnet_conv1d_resource.h"
#include <cstdlib>

namespace nnet {

struct conv1d_config {
// Internal data type definitions
typedef float bias_t;
typedef float weight_t;
typedef float accum_t;

// Convolutional parameters
static const unsigned pad_left = 0;
static const unsigned pad_right = 0;
static const unsigned in_width = 10;
static const unsigned n_chan = 0;
static const unsigned filt_width = 1;
static const unsigned kernel_size = filt_width;
static const unsigned n_filt = 1;
static const unsigned stride_width = 1;
static const unsigned dilation = 1;
static const unsigned out_width = 10; // (N_IN + PAD_LEFT + PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1

static const unsigned reuse_factor = 1;
static const bool store_weights_in_bram = false;
static const unsigned n_zeros = 0; // not used yet
};

template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

if (CONFIG_T::strategy == nnet::latency) {
conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

// Nothing special to be done for io_parallel implementation
if (CONFIG_T::strategy == nnet::latency) {
conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

} // namespace nnet

#endif
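For orientation, conv_1d_cl is instantiated with a generated CONFIG_T that derives from conv1d_config above. A minimal sketch of such a configuration follows; the struct name and sizes are illustrative, not taken from this PR, and the latency path additionally expects the mult_config, n_partitions/n_pixels, and fill_buffer members used in nnet_conv1d_latency.h below.

// Hypothetical stand-in for an hls4ml-generated per-layer config.
struct example_conv1d_config : nnet::conv1d_config {
    static const unsigned in_width = 10;
    static const unsigned n_chan = 3;
    static const unsigned filt_width = 3;
    static const unsigned n_filt = 4;
    static const unsigned out_width = 8; // (10 + 0 + 0 - (1*(3-1)+1)) / 1 + 1
    static const unsigned strategy = nnet::latency; // selects conv_1d_latency_cl in the dispatch above
};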
89 changes: 89 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -0,0 +1,89 @@
#ifndef NNET_CONV1D_LATENCY_H_
#define NNET_CONV1D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
constexpr unsigned mult_n_out = CONFIG_T::n_filt;

data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
#pragma HLS ARRAY_PARTITION variable=mult complete

typename CONFIG_T::accum_t acc[mult_n_out];
#pragma HLS ARRAY_PARTITION variable=acc complete

#pragma HLS ARRAY_PARTITION variable=weights complete
#pragma HLS ARRAY_PARTITION variable=biases complete

// Limit multipliers to control parallelization
#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

PixelLoop:
for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
#pragma HLS UNROLL

data_T cache;

// Do the matrix-multiply
Product1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
cache = data_buf[i_pxl][i_in];
Product2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
mult[i_in * mult_n_out + i_out] =
CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
cache, weights[i_in * mult_n_out + i_out]);
}
}

// Initialize accumulator with input biases
ResetAccum:
for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
#pragma HLS UNROLL
acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
}

// Accumulate multiplication result
Accum1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
Accum2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
acc[i_out] += mult[i_in * mult_n_out + i_out];
}
}

// Cast to "res_T" type
Result:
for (int i_res = 0; i_res < mult_n_out; i_res++) {
#pragma HLS UNROLL
res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
}
}
}
}

} // namespace nnet
#endif
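The fill_buffer call in PartitionLoop is also supplied by the generated CONFIG_T: for partition i_part it gathers, im2col-style, the mult_n_in receptive-field values of each of the n_pixels output pixels handled in that pipeline iteration. A hypothetical implementation for the unpadded, unit-dilation 1D case is sketched below; hls4ml generates these per layer, so this only illustrates the expected interface and indexing.

// Hypothetical functor matching the shape of data_buf above.
template <class data_T, typename CONFIG_T> struct example_fill_buffer {
    // Copy the receptive field of every output pixel in this partition
    // into the fully partitioned buffer consumed by PixelLoop.
    static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                            data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
                            const unsigned partition) {
        for (unsigned p = 0; p < CONFIG_T::n_pixels; p++) {
            const unsigned pixel = partition * CONFIG_T::n_pixels + p; // global output pixel index
            for (unsigned w = 0; w < CONFIG_T::filt_width; w++) {
                for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
                    buffer[p][w * CONFIG_T::n_chan + c] =
                        data[(pixel * CONFIG_T::stride_width + w) * CONFIG_T::n_chan + c];
                }
            }
        }
    }
};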
77 changes: 77 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
@@ -0,0 +1,77 @@
#ifndef NNET_CONV2D_H_
#define NNET_CONV2D_H_

#include "nnet_common.h"
#include "nnet_conv2d_latency.h"
#include "nnet_conv2d_resource.h"
#include <cstdlib>

namespace nnet {

struct conv2d_config {
// Internal data type definitions
typedef float bias_t;
typedef float weight_t;
typedef float accum_t;

// Convolutional parameters
static const unsigned pad_top = 0;
static const unsigned pad_bottom = 0;
static const unsigned pad_left = 0;
static const unsigned pad_right = 0;
static const unsigned in_height = 10;
static const unsigned in_width = 10;
static const unsigned n_chan = 1;
static const unsigned filt_height = 1;
static const unsigned filt_width = 1;
static const unsigned kernel_size = filt_height * filt_width;
static const unsigned n_filt = 1;
static const unsigned stride_height = 1;
static const unsigned stride_width = 1;
static const unsigned out_height = 10;
static const unsigned out_width = 10;
static const unsigned dilation_height = 1;
static const unsigned dilation_width = 1;

static const unsigned reuse_factor = 1;
static const bool store_weights_in_bram = false;
static const unsigned n_zeros = 0; // not used yet
};

template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_cl(
data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

if (CONFIG_T::strategy == nnet::latency) {
conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive

// Nothing special to be done for io_parallel implementation
if (CONFIG_T::strategy == nnet::latency) {
conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
} else {
conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
}

} // namespace nnet

#endif
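The out_height/out_width defaults above follow the same shape rule as the comment in nnet_conv1d.h. As a self-contained worked check (not part of the PR):

// Illustrative shape helper: out = (in + pad_lo + pad_hi - (dilation*(filt-1)+1)) / stride + 1
constexpr unsigned conv_out_dim(unsigned in, unsigned pad_lo, unsigned pad_hi,
                                unsigned filt, unsigned stride, unsigned dilation) {
    return (in + pad_lo + pad_hi - (dilation * (filt - 1) + 1)) / stride + 1;
}
static_assert(conv_out_dim(10, 1, 1, 3, 1, 1) == 10, "'same' 3x3 conv keeps a 10-wide dimension");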
90 changes: 90 additions & 0 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d_latency.h
@@ -0,0 +1,90 @@
#ifndef NNET_CONV2D_LATENCY_H_
#define NNET_CONV2D_LATENCY_H_

#include "nnet_common.h"
#include "nnet_mult.h"
#include <cstdlib>

namespace nnet {

template <class data_T, class res_T, typename CONFIG_T>
void conv_2d_latency_cl(
data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan;
constexpr unsigned mult_n_out = CONFIG_T::n_filt;

data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0

typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
#pragma HLS ARRAY_PARTITION variable=mult complete

typename CONFIG_T::accum_t acc[mult_n_out];
#pragma HLS ARRAY_PARTITION variable=acc complete

#pragma HLS ARRAY_PARTITION variable=weights complete
#pragma HLS ARRAY_PARTITION variable=biases complete

// Limit multipliers to control parallelization
#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

PartitionLoop:
for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind

CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);

PixelLoop:
for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
#pragma HLS UNROLL

data_T cache;

// Do the matrix-multiply
Product1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
cache = data_buf[i_pxl][i_in];
Product2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
mult[i_in * mult_n_out + i_out] =
CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
cache, weights[i_in * mult_n_out + i_out]);
}
}

// Initialize accumulator with input biases
ResetAccum:
for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
#pragma HLS UNROLL
acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
}

// Accumulate multiplication result
Accum1:
for (int i_in = 0; i_in < mult_n_in; i_in++) {
#pragma HLS UNROLL
Accum2:
for (int i_out = 0; i_out < mult_n_out; i_out++) {
#pragma HLS UNROLL
acc[i_out] += mult[i_in * mult_n_out + i_out];
}
}

// Cast to "res_T" type
Result:
for (int i_res = 0; i_res < mult_n_out; i_res++) {
#pragma HLS UNROLL
res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] =
cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
}
}
}
}

} // namespace nnet
#endif
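The ALLOCATION pragma in both latency kernels caps mul instances at mult_config::multiplier_limit; this is how reuse_factor trades throughput for resources, since each physical multiplier is then shared across several of the mult_n_in * mult_n_out products per pipeline iteration, matching the II=reuse_factor pipeline. A sketch of the expected relationship is below; hls4ml's backend computes the actual value and may also discount zero weights.

// Illustrative: ceiling division of total products by the reuse factor.
constexpr unsigned multiplier_limit(unsigned n_in, unsigned n_out, unsigned reuse) {
    return (n_in * n_out + reuse - 1) / reuse;
}
static_assert(multiplier_limit(27, 4, 4) == 27, "108 products at reuse 4 need 27 multipliers");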
hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h
@@ -450,7 +450,7 @@ template <class data_T, class res_T, typename CONFIG_T> void hard_tanh(hls::stre

data_T in_data = data.read();
res_T out_data;
-#pragma HLS DATA_PACK variable=out_data
+PRAGMA_DATA_PACK(out_data)

HardSigmoidPackLoop:
for (int j = 0; j < res_T::size; j++) {
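The remaining hunks replace the raw DATA_PACK pragma with the PRAGMA_DATA_PACK macro: Vitis HLS dropped DATA_PACK (struct stream payloads are aggregated by default there), while Vivado HLS still requires the pragma. A plausible definition of the macro, sketched here as an assumption since the exact spelling lives in hls4ml's shared headers:

// Assumed sketch: emit the legacy pragma only under Vivado HLS.
#ifdef __VITIS_HLS__
#define PRAGMA_DATA_PACK(var)
#else
#define DO_PRAGMA(x) _Pragma(#x)
#define PRAGMA_DATA_PACK(var) DO_PRAGMA(HLS data_pack variable = var)
#endif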
6 changes: 3 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h
@@ -145,7 +145,7 @@ void pointwise_mult_buffer(const data_T &data_pack, hls::stream<res_T> &res_stre
#pragma HLS ARRAY_PARTITION variable=res complete

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

InitData:
for (int id = 0; id < CONFIG_T::n_chan; id++) {
@@ -192,7 +192,7 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream<res_T
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

// Add pixel to buffer
nnet::kernel_shift_1d<data_T, CONFIG_T>(in_elem, kernel_data);
@@ -257,7 +257,7 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem,
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0

res_T res_pack;
-#pragma HLS DATA_PACK variable=res_pack
+PRAGMA_DATA_PACK(res_pack)

// Add pixel to buffer
nnet::shift_line_buffer<data_T, CONFIG_T>(in_elem, line_buffer, kernel_data);