attempt to add support for conv1d transpose
add new files for conv1dtranspose resource

clean up so that conv code is reached. Still need to get the actual implementation to match Keras

implement conv1dtranspose super inefficiently (gets correct answer though)

try to fix indices to make code work

make the c code work for conv1dtranspose

reduce weight dimensions to properly reflect transposed kernel size

clean up so that transpose filter width is passed around from config

fix code such that simple transpose layer gets synthesized

move variables out of loops, optimize slightly and add in alternative method of computation to compute by kernel (that option is not optimized as of now)

add in conv1d transpose linebuffer format code. seems to work; unsure if it is optimized yet

trying to fix stream behavior

get transpose compilation working mostly as expected. weird jump in latency from reuse 1 to 2 still exists

initial conv2dtranspose addition. Output is permuted as of now.

output in correct order. using large array to buffer output though

fix up conv1dtranspose a bit to pad correctly. fix up stream instructions for both 1d and 2d transposes

fix allowed reuse factors for transpose layers

update to new conv methods for io_parallel. Still some issues with multiple filters as well as some padding issues

clean up error with multiple filters and larger kernels

optimize conv transpose resource to get it working reasonably well. may still have slight optimization left

fix output to conv1d transpose resource

add conv2dtranspose io_parallel implementation. Can still be optimized

small changeup to data storage in conv1d parallel

fix zero padding pass addition for transpose stream layers

move transposing of weight matrix to resource_strategy for transpose layers

change how stream loads in weights to be like parallel for conv transposes. unroll all stride steps completely

fix output of 1d transpose parallel to be faster

change 1d transpose weight input to be 2-dimensional (passed from python code)

change 2d transpose weight input to be 3-dimensional (passed from python code)

small changes to transposes

Revert "fix nondefault project name handling (fastmachinelearning#626)". The commit breaks the Vivado Accelerator workflow, and the fix is unclear to me right now.

This reverts commit e8f048a.

steps towards getting integer inputs to work
Jonathan-Shoemaker authored and jmduarte committed Mar 18, 2023
1 parent d54843d commit 5175e1a
Showing 29 changed files with 1,881 additions and 233 deletions.
161 changes: 161 additions & 0 deletions hls4ml/backends/fpga/fpga_backend.py
@@ -79,6 +79,22 @@ def get_layer_mult_size(self, layer):
n_out = layer.get_attr('n_out')
return n_in, n_out

if 'Conv1DTranspose' in layer.class_name:
trfilt_width = (layer.get_attr('filt_width') + layer.get_attr('stride_width') - 1) \
// layer.get_attr('stride_width')
n_in = layer.get_attr('n_chan') * trfilt_width
n_out = layer.get_attr('n_filt')
return n_in, n_out

if 'Conv2DTranspose' in layer.class_name:
trfilt_width = (layer.get_attr('filt_width') + layer.get_attr('stride_width') - 1) \
// layer.get_attr('stride_width')
trfilt_height = (layer.get_attr('filt_height') + layer.get_attr('stride_height') - 1) \
// layer.get_attr('stride_height')
n_in = layer.get_attr('n_chan') * trfilt_height * trfilt_width
n_out = layer.get_attr('n_filt')
return n_in, n_out
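
Both branches above compute the transposed-kernel extent as the ceiling division ceil(filt / stride): each output phase of a stride-s transposed convolution only ever touches every s-th kernel tap. A minimal sanity check of the integer form (a sketch, not part of the commit):

# ceil(filt / stride) via integer arithmetic, as used for trfilt_width
# and trfilt_height above.
def trfilt(filt, stride):
    return (filt + stride - 1) // stride

assert trfilt(4, 2) == 2  # kernel 4, stride 2: 2 taps per output phase
assert trfilt(3, 1) == 3  # stride 1 reduces to the ordinary kernel width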

if 'Conv1D' in layer.class_name:
n_in = layer.get_attr('n_chan') * layer.get_attr('filt_width')
n_out = layer.get_attr('n_filt')
@@ -476,7 +492,67 @@ def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, ke
" ) {{\n"
).format(index=layer_idx)
indent = ' '
for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
for pixel_idx, arr in enumerate(partition):
buffer_stmts = []
for j, v in enumerate(arr):
if v == 0:
val = '0'
else:
val = 'data[{}]'.format(int(v-1))
buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
generated_code += '\n' + indent * 2 + '}\n'

generated_code += indent + '}\n'
generated_code += '};\n'

return generated_code

def _compute_conv1d_tr_im2col(self, input_shape, out_w, kernel=3, stride=1):
W, C = input_shape

tr_kernel = (kernel+stride-1)//stride

input_img = np.arange(1, W * C + 1)
im_matrix = np.zeros((tr_kernel * C * out_w, ))

index = 0
for i_ow in range(out_w):
for i_kw in range(tr_kernel):
for i_c in range(C):
# input column is just the output column shifted
input_col = i_ow - (tr_kernel-1) + i_kw
if (input_col >= 0 and input_col < W):
im_matrix[index] = input_img[input_col * C + i_c]
else:
im_matrix[index] = 0
index += 1
im_matrix = im_matrix.reshape(out_w, -1)
return im_matrix
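
The helper above builds one im2col row per output pixel, holding the 1-based positions of the tr_kernel * C inputs that pixel reads, with 0 wherever a tap falls off the input. A standalone mirror of the same indexing rule on hypothetical toy sizes (illustrative only, not the commit's code path):

import numpy as np

def conv1d_tr_im2col(W, C, out_w, kernel, stride):
    # mirrors _compute_conv1d_tr_im2col above
    tr_kernel = (kernel + stride - 1) // stride
    img = np.arange(1, W * C + 1)  # 1-based input positions
    rows = []
    for i_ow in range(out_w):
        for i_kw in range(tr_kernel):
            for i_c in range(C):
                col = i_ow - (tr_kernel - 1) + i_kw  # shifted input column
                rows.append(img[col * C + i_c] if 0 <= col < W else 0)
    return np.asarray(rows).reshape(out_w, -1)

print(conv1d_tr_im2col(4, 1, 4, kernel=3, stride=2))
# [[0 1]
#  [1 2]
#  [2 3]
#  [3 4]]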


def generate_conv1d_tr_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, out_W, kernel=3, stride=1):

im2col_matrix = self._compute_conv1d_tr_im2col(
(in_W, in_C),
out_W,
kernel,
stride,
)

generated_code = (
"template<class data_T, typename CONFIG_T>\n"
"class fill_buffer_{index} : public FillConv1DBuffer<data_T, CONFIG_T> {{\n"
" public:\n"
" static void fill_buffer(\n"
" data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
" data_T buffer[CONFIG_T::n_pixels][CONFIG_T::trfilt_width * CONFIG_T::n_chan],\n"
" const unsigned partition\n"
" ) {{\n"
).format(index=layer_idx)
indent = ' '
for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
for pixel_idx, arr in enumerate(partition):
@@ -622,6 +698,91 @@ def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in

return generated_code

def _compute_conv2d_tr_im2col(self, input_shape, out_shape, kernel=(3, 3), stride=(1, 1)):
H, W, C = input_shape
kernel_h, kernel_w = kernel
stride_h, stride_w = stride
out_h, out_w = out_shape

tr_kernel_h = (kernel_h+stride_h-1)//stride_h
tr_kernel_w = (kernel_w+stride_w-1)//stride_w

input_img = np.arange(1, H * W * C + 1)
im_matrix = np.zeros((tr_kernel_h * tr_kernel_w * C * out_h * out_w, ))

index = 0
for i_oh in range(out_h):
for i_ow in range(out_w):
for i_kh in range(tr_kernel_h):
input_row = i_oh - (tr_kernel_h-1) + i_kh
for i_kw in range(tr_kernel_w):
for i_c in range(C):
if (input_row < 0 or input_row >= H):
im_matrix[index] = 0
else:
input_col = i_ow - (tr_kernel_w-1) + i_kw
if (input_col >= 0 and input_col < W):
im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
else:
im_matrix[index] = 0
index += 1

im_matrix = im_matrix.reshape(out_h * out_w, -1)
return im_matrix


def generate_conv2d_tr_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, out_H, out_W, kernel=(3, 3), stride=(1, 1)):
if isinstance(kernel, Iterable):
kernel_height = kernel[0]
kernel_width = kernel[1]
else:
kernel_height = kernel
kernel_width = kernel

if isinstance(stride, Iterable):
stride_height = stride[0]
stride_width = stride[1]
else:
stride_height = stride
stride_width = stride

im2col_matrix = self._compute_conv2d_tr_im2col(
(in_H, in_W, in_C),
(out_H, out_W),
(kernel_height, kernel_width),
(stride_height, stride_width),
)

generated_code = (
"template<class data_T, typename CONFIG_T>\n"
"class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
" public:\n"
" static void fill_buffer(\n"
" data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
" data_T buffer[CONFIG_T::n_pixels][CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan],\n"
" const unsigned partition\n"
" ) {{\n"
).format(index=layer_idx)
indent = ' '

for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
for pixel_idx, arr in enumerate(partition):
buffer_stmts = []
for j, v in enumerate(arr):
if v == 0:
val = '0'
else:
val = 'data[{}]'.format(int(v-1))
buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
generated_code += '\n' + indent * 2 + '}\n'

generated_code += indent + '}\n'
generated_code += '};\n'

return generated_code
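
Both generators carve the im2col rows into n_partitions equal groups with numpy.split, so the output-pixel count must divide evenly by n_partitions; each group becomes one "if (partition == k)" branch of straight-line buffer assignments. A quick illustration of that partitioning (hypothetical sizes):

import numpy as np

rows = np.arange(8 * 6).reshape(8, 6)  # 8 output pixels, 6 taps each
parts = np.split(rows, 4)              # raises ValueError on uneven splits
print(len(parts), parts[0].shape)      # 4 (2, 6): 2 pixels per partition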

@model_optimizer()
def write_hls(self, model):
self.writer.write_hls(model)
9 changes: 9 additions & 0 deletions hls4ml/backends/fpga/fpga_types.py
@@ -326,6 +326,15 @@ def __init__(self, type_converter):

class StaticWeightVariableDefinition(VariableDefinition):
def definition_cpp(self, name_suffix='', as_reference=False):
if self.keep_dims > 0:
size_str = ''
for dim in range(self.keep_dims):
size_str += '[{cur_dim}]'.format(cur_dim=self.shape[dim])
final_dim = 1
for dim in range(self.keep_dims, len(self.shape)):
final_dim *= self.shape[dim]
size_str += '[{last_dim}]'.format(last_dim=final_dim)
return '{type} {name}{sizes}'.format(type=self.type.name, name=self.name, sizes=size_str)
return '{type} {name}[{size}]'.format(type=self.type.name, name=self.name, size=self.data_length)
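
With keep_dims set, the leading keep_dims axes of the weight tensor keep their own C array dimensions and the remaining axes are flattened into one, which is how the transpose layers receive their 2- and 3-dimensional weight arrays. A sketch of the sizing logic on a hypothetical shape:

# keep_dims=1 on a hypothetical shape [4, 3, 8] weight tensor
shape, keep_dims = [4, 3, 8], 1
size_str = ''.join('[{}]'.format(shape[d]) for d in range(keep_dims))
final_dim = 1
for d in range(keep_dims, len(shape)):
    final_dim *= shape[d]
size_str += '[{}]'.format(final_dim)
print('weight_t w' + size_str)  # -> weight_t w[4][24]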

class StaticWeightVariableConverter(object):
38 changes: 35 additions & 3 deletions hls4ml/backends/fpga/passes/codegen.py
@@ -1,17 +1,21 @@
from hls4ml.model.optimizer import OptimizerPass
from hls4ml.model.layers import Conv1D, Conv2D
from hls4ml.model.layers import Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose
from hls4ml.model.types import Source

class GenerateConvIm2col(OptimizerPass):
''' Generates code for the im2col step of 1D/2D convolution '''
def match(self, node):
return isinstance(node, (Conv1D, Conv2D)) and \
return isinstance(node, (Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose)) and \
node.model.config.get_config_value('IOType') == 'io_parallel'

def transform(self, model, node):
node_class = node.__class__.__name__
if '1D' in node_class:
if '1DTranspose' in node_class:
self._generate_im2col_1d_transpose(node)
elif '1D' in node_class:
self._generate_im2col_1d(node)
elif '2DTranspose' in node_class:
self._generate_im2col_2d_transpose(node)
elif '2D' in node_class:
self._generate_im2col_2d(node)
else:
@@ -30,6 +34,19 @@ def _generate_im2col_1d(self, node):

node.set_attr('line_buffer_codegen', Source(code_str))

def _generate_im2col_1d_transpose(self, node):
code_str = node.model.config.backend.generate_conv1d_tr_line_buffer_fn(
node.get_attr('index'),
node.get_attr('n_partitions'),
node.get_input_variable().shape[0],
node.get_input_variable().shape[1],
node.get_attr('proc_width'),
kernel=node.get_attr('filt_width'),
stride=node.get_attr('stride_width'),
)

node.set_attr('line_buffer_codegen', Source(code_str))

def _generate_im2col_2d(self, node):
code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
node.get_attr('index'),
@@ -43,3 +60,18 @@ def _generate_im2col_2d(self, node):
)

node.set_attr('line_buffer_codegen', Source(code_str))

def _generate_im2col_2d_transpose(self, node):
code_str = node.model.config.backend.generate_conv2d_tr_line_buffer_fn(
node.get_attr('index'),
node.get_attr('n_partitions'),
node.get_input_variable().shape[0],
node.get_input_variable().shape[1],
node.get_input_variable().shape[2],
node.get_attr('proc_height'),
node.get_attr('proc_width'),
kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
)

node.set_attr('line_buffer_codegen', Source(code_str))
112 changes: 111 additions & 1 deletion hls4ml/backends/vivado/passes/conv_same_pad.py
@@ -1,5 +1,5 @@
from hls4ml.model.optimizer import OptimizerPass
from hls4ml.model.layers import Conv1D, SeparableConv1D, Conv2D, SeparableConv2D
from hls4ml.model.layers import Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, Conv1DTranspose, Conv2DTranspose

class InsertZeroPaddingBeforeConv1D(OptimizerPass):
name = 'insert_zero_padding_before_conv1d'
@@ -46,6 +46,53 @@ def transform(self, model, node):

return True

class InsertZeroPaddingBeforeConv1DTranspose(OptimizerPass):
name = 'insert_zero_padding_before_conv1dtranspose'

def match(self, node):
is_match = isinstance(node, (Conv1DTranspose)) and \
node.get_attr('padding') == 'same' and \
node.get_attr('filt_width') != 1
return is_match

def transform(self, model, node):
if model.config.get_config_value('IOType') != 'io_stream':
return False

# Get the padding parameters from Conv1D layer
pad_left = node.get_attr('pad_left')
pad_right = node.get_attr('pad_right')
convtr_out_width = node.get_attr('out_width')
in_width = node.get_attr('in_width')
stride_width = node.get_attr('stride_width')
trfilt_width = (node.get_attr('filt_width') + node.get_attr('stride_width') - 1) \
// node.get_attr('stride_width')

add_right = (convtr_out_width + pad_left)//stride_width - (in_width-1)

out_width = in_width + add_right + trfilt_width-1

attrs = {
'pad_left': trfilt_width-1,
'pad_right': add_right,
'in_width': in_width,
'out_width': out_width,
'n_chan': node.get_attr('n_chan'),
'data_format': node.get_attr('data_format', 'channels_last')
}

# Switch Conv1DTranspose to be 'valid'. I think this is wrong
node.set_attr('padding', 'valid')
node.set_attr('in_width', out_width)
node.set_attr('pad_left', pad_left + (trfilt_width-1)*stride_width)

# Insert new ZeroPadding1D node above Conv1DTranspose
padding_layer = model.make_node('ZeroPadding1D', 'zp1d_' + node.name, attrs, node.inputs.copy())
padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision
model.insert_node(padding_layer)

return True
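
The arithmetic above front-loads trfilt_width - 1 zeros so the first output pixel sees a full window, and appends add_right zeros to cover the strided tail. A worked example with hypothetical layer attributes (pad_left and the Keras output width are assumed values):

# Hypothetical 'same' Conv1DTranspose: in_width=8, stride=2, kernel=4,
# pad_left=1, Keras output width 16.
in_width, stride_width, filt_width, pad_left = 8, 2, 4, 1
convtr_out_width = 16
trfilt_width = (filt_width + stride_width - 1) // stride_width              # 2
add_right = (convtr_out_width + pad_left) // stride_width - (in_width - 1)  # 1
out_width = in_width + add_right + trfilt_width - 1                         # 10
print(trfilt_width, add_right, out_width)  # 2 1 10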

class InsertZeroPaddingBeforeConv2D(OptimizerPass):
name = 'insert_zero_padding_before_conv2d'

@@ -100,3 +147,66 @@ def transform(self, model, node):
model.insert_node(padding_layer, before=node)

return True

class InsertZeroPaddingBeforeConv2DTranspose(OptimizerPass):
name = 'insert_zero_padding_before_conv2dtranspose'

def match(self, node):
is_match = isinstance(node, Conv2DTranspose) and \
node.get_attr('padding') == 'same' and \
node.get_attr('filt_width') != 1
return is_match

def transform(self, model, node):
if model.config.get_config_value('IOType') != 'io_stream':
return False

# Get the padding parameters from Conv2DTranspose layer
pad_left = node.get_attr('pad_left')
pad_right = node.get_attr('pad_right')
pad_top = node.get_attr('pad_top')
pad_bottom = node.get_attr('pad_bottom')
convtr_out_width = node.get_attr('out_width')
convtr_out_height = node.get_attr('out_height')
in_width = node.get_attr('in_width')
in_height = node.get_attr('in_height')
stride_width = node.get_attr('stride_width')
stride_height = node.get_attr('stride_height')
trfilt_width = (node.get_attr('filt_width') + node.get_attr('stride_width') - 1) \
// node.get_attr('stride_width')
trfilt_height = (node.get_attr('filt_height') + node.get_attr('stride_height') - 1) \
// node.get_attr('stride_height')

add_right = (convtr_out_width + pad_left)//stride_width-(in_width-1)
add_bottom = (convtr_out_height + pad_top)//stride_height-(in_height-1)

out_width = in_width + add_right + trfilt_width-1
out_height = in_height + add_bottom + trfilt_height-1

attrs = {
'pad_left': trfilt_width-1,
'pad_right': add_right,
'pad_top': trfilt_height-1,
'pad_bottom': add_bottom,
'in_width': in_width,
'in_height': in_height,
'out_width': out_width,
'out_height': out_height,
'n_chan': node.get_attr('n_chan'),
'data_format': node.get_attr('data_format', 'channels_last')
}

# switch Conv2DTranspose to be 'valid'. This is technically not true though
node.set_attr('padding', 'valid')
node.set_attr('in_width', out_width)
node.set_attr('in_height', out_height)
node.set_attr('pad_left', pad_left + (trfilt_width-1)*stride_width)
node.set_attr('pad_top', pad_top + (trfilt_height-1)*stride_height)

# insert new ZeroPadding2D node above Conv2DTranspose
padding_layer = model.make_node('ZeroPadding2D', 'zp2d_' + node.name, attrs, node.inputs.copy())
padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision
model.insert_node(padding_layer, before=node)

return True

