Skip to content

Commit

Permalink
Merge pull request #600 from vloncar/instruct_cnn
Browse files Browse the repository at this point in the history
Unrolled CNN implementation
  • Loading branch information
jmitrevs authored Oct 4, 2022
2 parents ee891c3 + cd915eb commit 90d760a
Show file tree
Hide file tree
Showing 19 changed files with 658 additions and 834 deletions.
238 changes: 238 additions & 0 deletions hls4ml/backends/fpga/fpga_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,27 @@ def set_target_reuse_factor(self, layer):

layer.set_attr('reuse_factor', float(rf) / kernel_multiplies)

def get_valid_conv_partition_splits(self, out_height, out_width):
"""Generate valid partition splits of a Conv1D/2D layer.
Essentially a list of divisors of the number of pixels of the output image.
Args:
out_height (int): The height of the output image
out_width (int): The width of the output image
Returns:
list: List of valid partition splits
"""
n_pixels = out_height * out_width
valid_n_partitions = []
for i in range(1, int(n_pixels / 2) + 1):
if n_pixels % i == 0:
valid_n_partitions.append(i)
valid_n_partitions.append(n_pixels)

return valid_n_partitions

@classmethod
def convert_precision_string(cls, precision):
if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType):
Expand Down Expand Up @@ -384,6 +405,223 @@ def compute_conv2d_instructions(self, in_H, in_W, in_C, kernel_size=3, stride=1,

return (min_H, min_W, windows_int)

def _compute_conv1d_im2col(self, input_shape, kernel=3, stride=1, pad=(0,0), dilation=1):
W, C = input_shape
pad_l, pad_r = pad

out_w = (W + pad_l + pad_r - (dilation * (kernel - 1) + 1)) // stride + 1

input_img = np.arange(1, W * C + 1)
im_matrix = np.zeros((kernel * C * out_w, ))

index = 0
for i_ow in range(out_w):
for i_kw in range(kernel):
for i_c in range(C):
input_col = -pad_l + i_kw * dilation + i_ow * stride
if (input_col >= 0 and input_col < W):
im_matrix[index] = input_img[input_col * C + i_c]
else:
im_matrix[index] = 0
index += 1

im_matrix = im_matrix.reshape(out_w, -1)
return im_matrix


def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, kernel=3, stride=1, pad=0, dilation=1):
"""Generate a C++ function that mimics the im2col algorithm. This function works for 1D convolution.
The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is
to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since
the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc),
we need to do this for every convolution layer.
Args:
layer_idx (int): Index of layer ('index' attribute).
n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
in_W (int): Width of input.
in_C (int): Number of channels.
kernel (int, optional): Size of the kernel. Defaults to 3.
stride (int, optional): Stride length. Defaults to 1.
pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [left_pad, right_pad]. Defaults to 0.
dilation (int, optional): Dilation rate. Defaults to 1.
Returns:
str: Generated C++ function
"""
if isinstance(pad, Iterable):
pad_left = pad[0]
pad_right = pad[1]
else:
pad_left = pad
pad_right = pad

im2col_matrix = self._compute_conv1d_im2col(
(in_W, in_C),
kernel,
stride,
(pad_left, pad_right),
dilation
)

generated_code = (
"template<class data_T, typename CONFIG_T>\n"
"class fill_buffer_{index} : public FillConv1DBuffer<data_T, CONFIG_T> {{\n"
" public:\n"
" static void fill_buffer(\n"
" data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
" data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
" const unsigned partition\n"
" ) {{\n"
).format(index=layer_idx)
indent = ' '

for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
for pixel_idx, arr in enumerate(partition):
buffer_stmts = []
for j, v in enumerate(arr):
if v == 0:
val = '0'
else:
val = 'data[{}]'.format(int(v-1))
buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
generated_code += '\n' + indent * 2 + '}\n'

generated_code += indent + '}\n'
generated_code += '};\n'

return generated_code

def _compute_conv2d_im2col(self, input_shape, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1,1)):
H, W, C = input_shape
kernel_h, kernel_w = kernel
stride_h, stride_w = stride
pad_t, pad_b, pad_l, pad_r = pad
dilation_h, dilation_w = dilation

out_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
out_w = (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1

input_img = np.arange(1, H * W * C + 1)
im_matrix = np.zeros((kernel_h * kernel_w * C * out_h * out_w, ))

index = 0
for i_oh in range(out_h):
for i_ow in range(out_w):
for i_kh in range(kernel_h):
input_row = -pad_t + i_kh * dilation_h + i_oh * stride_h
for i_kw in range(kernel_w):
for i_c in range(C):
if (input_row < 0 or input_row >= H):
im_matrix[index] = 0
else:
input_col = -pad_l + i_kw * dilation_w + i_ow * stride_w
if (input_col >= 0 and input_col < W):
im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
else:
im_matrix[index] = 0
index += 1

im_matrix = im_matrix.reshape(out_h * out_w, -1)
return im_matrix


def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
"""Generate a C++ function that mimics the im2col algorithm. This function works for 2D convolution.
The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is
to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since
the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc),
we need to do this for every convolution layer.
Args:
layer_idx (int): Index of layer ('index' attribute).
n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
in_H (int): Height of input.
in_W (int): Width of input.
in_C (int): Number of channels.
kernel (int or Iterable, optional): Size of the kernel. Defaults to (3,3).
stride (int or Iterable, optional): Stride length. Defaults to (1,1).
pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [top_pad, bottom_pad, left_pad, right_pad]. Defaults to 0.
dilation (int or Iterable, optional): Dilation rate. Defaults to (1,1).
Returns:
str: Generated C++ function
"""

if isinstance(kernel, Iterable):
kernel_height = kernel[0]
kernel_width = kernel[1]
else:
kernel_height = kernel
kernel_width = kernel

if isinstance(stride, Iterable):
stride_height = stride[0]
stride_width = stride[1]
else:
stride_height = stride
stride_width = stride

if isinstance(pad, Iterable):
pad_top = pad[0]
pad_bottom = pad[1]
pad_left = pad[2]
pad_right = pad[3]
else:
pad_top = pad
pad_bottom = pad
pad_left = pad
pad_right = pad

if isinstance(dilation, Iterable):
dilation_height = dilation[0]
dilation_width = dilation[1]
else:
dilation_height = dilation
dilation_width = dilation

im2col_matrix = self._compute_conv2d_im2col(
(in_H, in_W, in_C),
(kernel_height, kernel_width),
(stride_height, stride_width),
(pad_top, pad_bottom, pad_left, pad_right),
(dilation_height, dilation_width)
)

generated_code = (
"template<class data_T, typename CONFIG_T>\n"
"class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
" public:\n"
" static void fill_buffer(\n"
" data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
" data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
" const unsigned partition\n"
" ) {{\n"
).format(index=layer_idx)
indent = ' '

for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
for pixel_idx, arr in enumerate(partition):
buffer_stmts = []
for j, v in enumerate(arr):
if v == 0:
val = '0'
else:
val = 'data[{}]'.format(int(v-1))
buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
generated_code += '\n' + indent * 2 + '}\n'

generated_code += indent + '}\n'
generated_code += '};\n'

return generated_code

@model_optimizer()
def write_hls(self, model):
self.writer.write_hls(model)
Expand Down
45 changes: 45 additions & 0 deletions hls4ml/backends/fpga/passes/codegen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from hls4ml.model.optimizer import OptimizerPass
from hls4ml.model.layers import Conv1D, Conv2D
from hls4ml.model.types import Source

class GenerateConvIm2col(OptimizerPass):
''' Generates tcode for im2col step of 1D/2d convolution '''
def match(self, node):
return isinstance(node, (Conv1D, Conv2D)) and \
node.model.config.get_config_value('IOType') == 'io_parallel'

def transform(self, model, node):
node_class = node.__class__.__name__
if '1D' in node_class:
self._generate_im2col_1d(node)
elif '2D' in node_class:
self._generate_im2col_2d(node)
else:
raise Exception('Cannot generate instructions for node {} ({})'.format(node.name, node_class))

def _generate_im2col_1d(self, node):
code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
node.get_attr('index'),
node.get_attr('n_partitions'),
node.get_input_variable().shape[0],
node.get_input_variable().shape[1],
kernel=node.get_attr('filt_width'),
stride=node.get_attr('stride_width'),
pad=(node.get_attr('pad_left'), node.get_attr('pad_right'))
)

node.set_attr('line_buffer_codegen', Source(code_str))

def _generate_im2col_2d(self, node):
code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
node.get_attr('index'),
node.get_attr('n_partitions'),
node.get_input_variable().shape[0],
node.get_input_variable().shape[1],
node.get_input_variable().shape[2],
kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
pad=(node.get_attr('pad_top'), node.get_attr('pad_bottom'), node.get_attr('pad_left'), node.get_attr('pad_right'))
)

node.set_attr('line_buffer_codegen', Source(code_str))
4 changes: 4 additions & 0 deletions hls4ml/backends/quartus/quartus_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ def init_conv1d(self, layer):
# - Winograd - use Winograd, if possible
layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination'))

layer.set_attr('n_partitions', 1) #TODO Not used yet as there is no codegen implementation of CNNs for Quartus backend

@layer_optimizer(Conv2D)
def init_conv2d(self, layer):
# This can happen if we assign weights of Dense layer to 1x1 Conv2D
Expand All @@ -281,6 +283,8 @@ def init_conv2d(self, layer):
# - im2col - specifically use im2col
# - Winograd - use Winograd, if possible
layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination'))

layer.set_attr('n_partitions', 1) #TODO Not used yet as there is no codegen implementation of CNNs for Quartus backend

@layer_optimizer(LSTM)
def init_lstm(self, layer):
Expand Down
20 changes: 20 additions & 0 deletions hls4ml/backends/vivado/passes/convolution_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
static const unsigned min_width = {min_width};
static const ap_uint<filt_width> pixels[min_width];
static const unsigned n_partitions = {n_partitions};
static const unsigned n_pixels = out_width / n_partitions;
template<class data_T, class CONFIG_T>
using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
typedef {accum_t.name} accum_t;
typedef {bias_t.name} bias_t;
typedef {weight_t.name} weight_t;
Expand All @@ -60,6 +64,10 @@ def format(self, node):
params['nzeros'] = node.get_weights('weight').nzeros

params['config_t'] = 'config{}_mult'.format(node.index)
if node.model.config.get_config_value('IOType') == 'io_parallel':
params['fill_fn'] = 'fill_buffer_{}'.format(node.index)
else:
params['fill_fn'] = 'FillConv1DBuffer'
conv_config = self.template.format(**params)

mult_params = self._default_config_params(node)
Expand Down Expand Up @@ -109,6 +117,10 @@ def format(self, node):
static const unsigned min_height = {min_height};
static const unsigned min_width = {min_width};
static const ap_uint<filt_height * filt_width> pixels[min_height * min_width];
static const unsigned n_partitions = {n_partitions};
static const unsigned n_pixels = out_height * out_width / n_partitions;
template<class data_T, class CONFIG_T>
using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
typedef {accum_t.name} accum_t;
typedef {bias_t.name} bias_t;
typedef {weight_t.name} weight_t;
Expand All @@ -133,6 +145,10 @@ def format(self, node):
params['nzeros'] = node.get_weights('weight').nzeros

params['config_t'] = 'config{}_mult'.format(node.index)
if node.model.config.get_config_value('IOType') == 'io_parallel':
params['fill_fn'] = 'fill_buffer_{}'.format(node.index)
else:
params['fill_fn'] = 'FillConv2DBuffer'
conv_config = self.template.format(**params)

mult_params = self._default_config_params(node)
Expand Down Expand Up @@ -198,6 +214,7 @@ def format(self, node):
params['nzeros'] = node.get_weights('depthwise').nzeros
params['index'] = str(node.index) + '_depthwise'
params['weight_t'] = node.get_weights('depthwise').type
params['fill_fn'] = 'FillConv1DBuffer'

params['config_t'] = 'config{}_depthwise_mult'.format(node.index)
depthwise_config = self.depthwise_template.format(**params)
Expand Down Expand Up @@ -229,6 +246,7 @@ def format(self, node):
params['weight_t'] = node.get_weights('pointwise').type
params['min_width'] = params['in_width']
params['instructions'] = '0'
params['fill_fn'] = 'FillConv1DBuffer'

params['config_t'] = 'config{}_pointwise_mult'.format(node.index)
pointwise_config = self.pointwise_template.format(**params)
Expand Down Expand Up @@ -283,6 +301,7 @@ def format(self, node):
params['nzeros'] = node.get_weights('depthwise').nzeros
params['index'] = str(node.index) + '_depthwise'
params['weight_t'] = node.get_weights('depthwise').type
params['fill_fn'] = 'FillConv2DBuffer'

params['config_t'] = 'config{}_depthwise_mult'.format(node.index)
depthwise_config = self.depthwise_template.format(**params)
Expand Down Expand Up @@ -314,6 +333,7 @@ def format(self, node):
params['min_height'] = params['in_height']
params['min_width'] = params['in_width']
params['instructions'] = '0'
params['fill_fn'] = 'FillConv2DBuffer'

params['config_t'] = 'config{}_pointwise_mult'.format(node.index)
pointwise_config = self.pointwise_template.format(**params)
Expand Down
Loading

0 comments on commit 90d760a

Please sign in to comment.