Merge pull request fastmachinelearning#629 from vloncar/vitis_port

Vitis HLS backend
calad0i · Mar 31, 2023 · 0da2d8e · 0da2d8e
2 parents c947ec9 + 5be3146
commit 0da2d8e
Show file tree

Hide file tree

Showing 72 changed files with 1,923 additions and 346 deletions.
diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py
@@ -5,8 +5,10 @@
 from hls4ml.backends.vivado.vivado_backend import VivadoBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig
+from hls4ml.backends.vitis.vitis_backend import VitisBackend
 from hls4ml.backends.quartus.quartus_backend import QuartusBackend
 
 register_backend('Vivado', VivadoBackend)
 register_backend('VivadoAccelerator', VivadoAcceleratorBackend)
+register_backend('Vitis', VitisBackend)
 register_backend('Quartus', QuartusBackend)
diff --git a/hls4ml/backends/vitis/__init__.py b/hls4ml/backends/vitis/__init__.py
diff --git a/hls4ml/backends/vitis/passes/__init__.py b/hls4ml/backends/vitis/passes/__init__.py
diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py
@@ -0,0 +1,45 @@
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ValidateConvImplementation(OptimizerPass):
+    """Replace the "Encoded" convolution implementation, unsupported by Vitis, with "LineBuffer"."""
+
+    def match(self, node):
+        """Select every node whose class name contains ``'Conv'``."""
+        return 'Conv' in node.class_name
+
+    def transform(self, model, node):
+        """Warn and reset ``implementation`` to ``'linebuffer'`` if it is ``'encoded'``.
+
+        A node without an ``implementation`` attribute is treated as ``'linebuffer'``
+        and left untouched.
+        """
+        if node.get_attr('implementation', 'linebuffer') == 'encoded':
+            print(f'WARNING: "Encoded" implementation in "{node.name}" ({node.class_name}) is not supported in Vitis backend. Switching to "LineBuffer" implementation.')
+            node.set_attr('implementation', 'linebuffer')
+
+
+class ValidateStrategy(OptimizerPass):
+    """Warn when a "Resource" strategy layer uses a reuse factor that may synthesize poorly in Vitis."""
+
+    # Class-name substrings identifying the layers this check applies to.
+    _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense']
+
+    def match(self, node):
+        """Select nodes of a listed layer kind for which the config reports the resource strategy."""
+        # any() stops at the first hit instead of building a throwaway list just to measure it.
+        is_resource_layer = any(layer_cls in node.class_name for layer_cls in self._resource_layer_cls)
+        is_resource_strategy = node.model.config.is_resource_strategy(node)
+
+        return is_resource_layer and is_resource_strategy
+
+    def transform(self, model, node):
+        """Print a warning if the reuse factor exceeds ``n_in`` without being a multiple of it.
+
+        Only a message is emitted; ``node`` itself is not modified.
+        """
+        # First element of the backend's layer multiplication size; the second is unused here.
+        n_in, _ = model.config.backend.get_layer_mult_size(node)
+        rf = node.get_attr('reuse_factor')
+        if rf > n_in and rf % n_in > 0:
+            print(f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis backend due to use of "urem" cores. Consider using a different ReuseFactor or switching to "Latency" strategy.')
diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
@@ -0,0 +1,74 @@
+import os
+import shutil
+import sys
+
+from hls4ml.backends import VivadoBackend
+from hls4ml.model.flow import register_flow, get_flow
+from hls4ml.report import parse_vivado_report
+
+
+class VitisBackend(VivadoBackend):
+    """Backend registered under the name 'Vitis' that builds projects with ``vitis_hls``."""
+
+    def __init__(self):
+        # super(VivadoBackend, self) starts the MRO lookup after VivadoBackend, so
+        # VivadoBackend.__init__ is bypassed and the name 'Vitis' goes to the next
+        # class in line; the two registration steps are then run explicitly here.
+        super(VivadoBackend, self).__init__(name='Vitis')
+        self._register_layer_attributes()
+        self._register_flows()
+
+    def _register_flows(self):
+        """Register the 'validation', 'apply_templates', 'write' and 'ip' flows for this backend.
+
+        The 'ip' flow is registered with ``None`` as its passes: it reuses the
+        requirements of 'vivado:ip' with the Vitis validation and template flows inserted.
+        """
+        validation_passes = [
+            'vitis:validate_conv_implementation',
+            'vitis:validate_strategy',
+        ]
+        validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name)
+
+        # Any potential templates registered specifically for Vitis backend
+        template_flow = register_flow('apply_templates', self._get_layer_templates, requires=['vivado:init_layers'], backend=self.name)
+
+        writer_passes = ['make_stamp', 'vitis:write_hls']
+        self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name)
+
+        # Work on a copy so the requirement list of 'vivado:ip' itself is not mutated.
+        ip_flow_requirements = get_flow('vivado:ip').requires.copy()
+        # list.insert() places each Vitis flow directly before the Vivado entry it is paired with.
+        ip_flow_requirements.insert(ip_flow_requirements.index('vivado:init_layers'), validation_flow)
+        ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow)
+
+        self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name)
+
+    def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False):
+        """Run ``vitis_hls -f build_prj.tcl`` in the model's output directory and parse the report.
+
+        Each keyword flag is forwarded to the Tcl script as ``name=value``.
+
+        Returns:
+            The result of ``parse_vivado_report`` for the model's output directory.
+
+        Raises:
+            Exception: On Linux, if ``vitis_hls`` cannot be found on PATH.
+        """
+        if 'linux' in sys.platform:
+            # Look the executable up directly instead of spawning a shell for `command -v`.
+            if shutil.which('vitis_hls') is None:
+                raise Exception('Vitis HLS installation not found. Make sure "vitis_hls" is on PATH.')
+
+        curr_dir = os.getcwd()
+        os.chdir(model.config.get_output_dir())
+        try:
+            os.system(
+                f'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} '
+                f'validation={validation} export={export} vsynth={vsynth}"'
+            )
+        finally:
+            # Restore the caller's working directory even if the build is interrupted.
+            os.chdir(curr_dir)
+
+        return parse_vivado_report(model.config.get_output_dir())
diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -9,6 +9,8 @@
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
     static const unsigned strategy = nnet::{strategy};
+    static const unsigned n_zeros = 0;
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
@@ -123,6 +125,7 @@ def format(self, node):
     static const unsigned out_width = {out_width};
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
+    static const unsigned multiplier_limit = DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor;
     static const bool store_weights_in_bram = false;
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
@@ -363,6 +366,9 @@ def format(self, node):
 
         # Depthwise config
         params = self._default_config_params(node)
+        # Override bias and bias_t since these are zeros in depthwise step of SepConv2D
+        params['bias'] = params['zero_bias']
+        params['bias_t'] = params['zero_bias_t']
         params['n_filt'] = params['n_chan']  # In depthwise step n_chan == n_filt
         params['dilation'] = node.get_attr('dilation', 1)
         params['nzeros'] = node.get_weights('depthwise').nzeros

diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py
@@ -12,6 +12,7 @@
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
     static const unsigned n_nonzeros = {nonzeros};
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor;
     static const bool store_weights_in_bram = false;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
@@ -63,6 +64,7 @@ def format(self, node):
     static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt;
     static const unsigned io_type = nnet::{iotype};
     static const unsigned reuse_factor = {reuse};
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor);
     static const bool store_weights_in_bram = false;
     typedef {bias_t.name} bias_t;
     typedef {scale_t.name} scale_t;

diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py
@@ -49,6 +49,7 @@ def format(self, node):
     static const unsigned n_in = {n_in};
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor);
     typedef {accum_t.name} accum_t;
     template<class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;

diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py
@@ -12,6 +12,7 @@
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
     static const unsigned n_nonzeros = {nonzeros};
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor;
     static const bool store_weights_in_bram = false;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;

diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py
@@ -56,15 +56,21 @@ def _find_solutions(sln_dir):
     solutions = []
 
     if os.path.isfile(sln_dir + '/vivado_hls.app'):
-        with open(sln_dir + '/vivado_hls.app') as f:
-            # Get rid of namespaces (workaround to support two types of vivado_hls.app files)
-            xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1)
+        sln_file = 'vivado_hls.app'
+    elif os.path.isfile(sln_dir + '/hls.app'):
+        sln_file = 'hls.app'
+    else:
+        return solutions
+
+    with open(sln_dir + '/' + sln_file) as f:
+        # Get rid of namespaces (workaround to support two types of vivado_hls.app files)
+        xmlstring = re.sub(' xmlns="[^"]+"', '', f.read(), count=1)
 
-        root = ET.fromstring(xmlstring)
-        for sln_tag in root.findall('solutions/solution'):
-            sln_name = sln_tag.get('name')
-            if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name):
-                solutions.append(sln_name)
+    root = ET.fromstring(xmlstring)
+    for sln_tag in root.findall('solutions/solution'):
+        sln_name = sln_tag.get('name')
+        if sln_name is not None and os.path.isdir(sln_dir + '/' + sln_name):
+            solutions.append(sln_name)
 
     return solutions
 
@@ -172,8 +178,13 @@ def parse_vivado_report(hls_dir):
         # Area
         area_node = root.find('./AreaEstimates')
         for child in area_node.find('./Resources'):
+            # DSPs are called 'DSP48E' in Vivado and just 'DSP' in Vitis. Overriding here to have consistent keys
+            if child.tag == 'DSP48E':
+                child.tag = 'DSP'
             c_synth_report[child.tag] = child.text
         for child in area_node.find('./AvailableResources'):
+            if child.tag == 'DSP48E':
+                child.tag = 'DSP'
             c_synth_report['Available' + child.tag] = child.text
         report['CSynthesisReport'] = c_synth_report
     else:

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h
@@ -0,0 +1,126 @@
+#ifndef NNET_CONV1D_RESOURCE_H_
+#define NNET_CONV1D_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+
+namespace nnet {
+
+// 1D convolution on array inputs, computed in CONFIG_T::reuse_factor pipelined steps.
+//
+// For each of the CONFIG_T::n_partitions partitions the function:
+//   1. fills data_buf from data through CONFIG_T::fill_buffer,
+//   2. initializes every accumulator with the bias of its filter,
+//   3. accumulates data * weight products, block_factor of them per pixel in each ReuseLoop iteration,
+//   4. casts the accumulators and writes them to res in (pixel, filter) order.
+//
+// data    - inputs, in_width * n_chan elements
+// res     - outputs, out_width * n_filt elements; the pointer is advanced as results are written
+// weights - filt_width * n_chan * n_filt elements, read as rows of reuse_factor elements
+// biases  - n_filt elements, one per filter
+template<class data_T, class res_T, typename CONFIG_T>
+void conv_1d_resource_cl(
+    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T  res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t   biases[CONFIG_T::n_filt])
+{
+    // Input and output counts of the multiplication performed for one pixel
+    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
+    constexpr unsigned mult_n_out = CONFIG_T::n_filt;
+    // Trip count of MultLoop, i.e. products computed per pixel in one ReuseLoop iteration
+    constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor);
+    // Number of consecutive MultLoop iterations that accumulate into the same output index
+    constexpr unsigned multscale = block_factor / mult_n_out;
+
+    assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && "The current Reuse Factor is not allowed");
+    assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && "This function is correct only for RF <= FILT_WIDTH * N_CHAN");
+
+    // Treating weights as 2d is required to make sure Vitis doesn't use urem cores to calculate indices.
+    // Also, we don't apply ARRAY_RESHAPE pragma as Vitis figures this out on its own.
+    // weights_2d[i_blk][i_rf] addresses weights[i_blk * reuse_factor + i_rf].
+    // NOTE(review): the loops below touch block_factor * reuse_factor entries; assuming DIV_ROUNDUP is a
+    // ceiling division, that exceeds the size of weights whenever reuse_factor does not divide
+    // mult_n_in * mult_n_out -- presumably such reuse factors are rejected before code generation; confirm.
+    typename CONFIG_T::weight_t (*weights_2d)[CONFIG_T::reuse_factor] = (typename CONFIG_T::weight_t (*)[CONFIG_T::reuse_factor]) weights;
+
+    // Per-pixel inputs, filled by fill_buffer for the current partition
+    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0
+
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    // One accumulator per pixel and filter
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
+
+    PartitionLoop:
+    for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
+        //#pragma HLS UNROLL // We don't want this loop unrolled
+
+        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);
+
+        // Start every accumulator from the bias of its filter
+        PixelInitAccumLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+
+            InitAccumLoop:
+            for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) {
+                #pragma HLS UNROLL
+                acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t) biases[i_acc];
+            }
+        }
+
+        ReuseLoop:
+        for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) {
+            #pragma HLS PIPELINE II=1 rewind
+
+            // Input index starts at i_rf and advances in steps of reuse_factor (see below)
+            unsigned i_in = i_rf;
+            unsigned i_out = 0;
+            // Counts MultLoop iterations spent on the current i_out
+            unsigned i_acc = 0;
+
+            MultLoop:
+            for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) {
+                #pragma HLS UNROLL
+
+                PixelMultLoop:
+                for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+                    #pragma HLS UNROLL
+
+                    acc[i_pxl][i_out] += static_cast<typename CONFIG_T::accum_t>(
+                            CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(data_buf[i_pxl][i_in], weights_2d[i_blk][i_rf]));
+                }
+
+                // Increment i_in, wrapping back to i_rf once it reaches mult_n_in
+                i_in += CONFIG_T::reuse_factor;
+                if (i_in >= mult_n_in) {
+                    i_in = i_rf;
+                }
+                // Increment i_out after multscale iterations have used the current one
+                if (i_acc + 1 >= multscale) {
+                    i_acc = 0;
+                    i_out++;
+                } else {
+                    i_acc++;
+                }
+            }
+        }
+
+        PixelResultLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+            // Cast to "res_t" type
+            ResultLoop:
+            for (unsigned i_res = 0; i_res < mult_n_out; i_res++) {
+                #pragma HLS UNROLL
+                *(res++) = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_pxl][i_res]);
+            }
+        }
+    }
+}
+
+}
+#endif
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_stream.h
@@ -0,0 +1,48 @@
+#ifndef NNET_CONV1D_STREAM_H_
+#define NNET_CONV1D_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_conv_stream.h"
+#include "hls_stream.h"
+
+namespace nnet {
+
+// Streaming 1D convolution.
+//
+// Reads CONFIG_T::in_width elements from data, one per loop iteration, and hands each to
+// compute_output_buffer_1d together with the output stream, weights and biases.
+//
+// data    - input stream
+// res     - output stream, passed on to compute_output_buffer_1d
+// weights - filt_width * n_chan * n_filt elements
+// biases  - n_filt elements
+template<class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(
+    hls::stream<data_T> &data,
+    hls::stream<res_T>  &res,
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t   biases[CONFIG_T::n_filt])
+{
+    assert(CONFIG_T::implementation == conv_implementation::linebuffer && "Only \"linebuffer\" implementation is supported in Vitis HLS.");
+
+    // Non-zero padding is rejected here. NOTE(review): presumably padding is applied before this layer -- confirm.
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+
+    // Latency strategy: pipeline the read loop with II = reuse_factor.
+    // Any other strategy: same loop body, but without a PIPELINE pragma on the loop.
+    if (CONFIG_T::strategy == nnet::latency) {
+        ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) {
+            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+            compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read(), res, weights, biases);
+        }
+    } else {
+        ReadInputWidthSerial: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) {
+            compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read(), res, weights, biases);
+        }
+    }
+
+}
+
+
+}
+#endif