diff --git a/ben_setup.py b/ben_setup.py new file mode 100644 index 00000000..7ae435be --- /dev/null +++ b/ben_setup.py @@ -0,0 +1,37 @@ +import setuptools + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setuptools.setup( + name="fpgaconvnet-optimiser-BenDev", # Replace with your own username + version="0.0.6", + author="Alex Montgomerie & Ben Biggs", + author_email="am9215@ic.ac.uk bb2515@ic.ac.uk", + description="Optimiser for mapping convolutional neural network models to FPGA platforms.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/AlexMontgomerie/fpgaconvnet-optimiser", + include_package_data=True, + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', + install_requires=[ + "networkx>=2.5", + "numpy>=1.19.2", + "protobuf>=3.13.0", + "torch>=1.7.1", + "pyyaml>=5.1.0", + "scipy>=1.2.1", + "torchvision>=0.8.2", + "onnx==1.8.0", + "onnxruntime>=1.6.0", + "graphviz>=0.16", + "pydot>=1.4.2", + "onnxoptimizer>=0.2.5", + "ddt>=1.4.2", + ] +) diff --git a/fpgaconvnet_optimiser/models/layers/BufferLayer.py b/fpgaconvnet_optimiser/models/layers/BufferLayer.py new file mode 100644 index 00000000..1c5eb296 --- /dev/null +++ b/fpgaconvnet_optimiser/models/layers/BufferLayer.py @@ -0,0 +1,138 @@ +""" +Buffering layer + +Stores intermediate compute information such as results from Conv or Pool layers. +During DSE the required size will be calculated to store intermediate results at +branching layers. The position of the buffer layer will then be moved along a +given branch until the buffer size is feasible and the latency of the exit +condition is mitigated/matched. For effective pipelining I think. + +Secondary function of the buffer is to "drop" a partial calculation. +Clear a FIFO - takes X number of cycles? +Drop signal will be control signal from the Exit Condition. + +Future goal will be to have buffer as an offchip memory link. +In this case, the drop might not be used. + +If "drop_mode" True then when True ctrl signal received, drop the data. +If "drop_mode" False then use inverted ctrl signal. 
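+
+Illustrative use of the functional model below (shapes and the control edge
+name are placeholders, not values taken from a real network):
+
+    buffer = BufferLayer(rows=28, cols=28, channels=64,
+                         coarse_in=1, coarse_out=1,
+                         ctrledge="exit_condition_0", drop_mode=True)
+    # data is a (rows, cols, channels) numpy array
+    out = buffer.functional_model(data, ctrl_drop=True)   # sample dropped, zeros returned
+    out = buffer.functional_model(data, ctrl_drop=False)  # sample passed through unchanged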
+""" + +import numpy as np +import math +import pydot +import torch + +from fpgaconvnet_optimiser.models.modules import Buffer +#from fpgaconvnet_optimiser.models.modules import Fork +from fpgaconvnet_optimiser.models.layers import Layer + +class BufferLayer(Layer): + def __init__( + self, + rows: int, + cols: int, + channels: int, + coarse_in: int, + coarse_out: int, + ctrledge, + drop_mode =True, + data_width =16, + ): + # initialise parent class + super().__init__([rows],[cols],[channels],[coarse_in],[coarse_out]) + + #ctrledge links to exit condition layer + self.ctrledge = ctrledge + self.drop_mode = drop_mode + + #init modules + self.modules = { + "buffer" : Buffer(rows,cols,channels, ctrledge, data_width) + } + self.update() + + ## LAYER INFO ## + def layer_info(self,parameters,batch_size=1): + parameters.batch_size = batch_size + parameters.buffer_depth = self.buffer_depth + parameters.rows_in = self.rows_in(0) + parameters.cols_in = self.cols_in(0) + parameters.channels_in = self.channels_in0() + parameters.rows_out = self.rows_out(0) + parameters.cols_out = self.cols_out(0) + parameters.channels_out = self.channels_out(0) + parameters.coarse_in = self.coarse_in + parameters.coarse_out = self.coarse_out + + ## UPDATE MODULES ## + def update(self): + self.modules['buffer'].rows = self.rows_in(0) + self.modules['buffer'].cols = self.cols_in(0) + self.modules['buffer'].channels = self.channels_in(0) + #TODO work out if channels = int(self.channels/self.coarse_in) + + + ### RATES ### + def rates_graph(self): + rates_graph = np.zeros( shape=(1,2) , dtype=float ) + #buffer + rates_graph[0,0] = self.modules['buffer'].rate_in(0) + rates_graph[0,1] = self.modules['buffer'].rate_out(0) + return rates_graph + + def update_coarse_in(self, coarse_in): + self.coarse_in = coarse_in + + def update_coarse_out(self, coarse_out): + self.coarse_out = coarse_out + + #def get_weights_reloading_feasible(self): + + def resource(self): + + buff_rsc = self.modules['buffer'].rsc() + + # Total + return { + "LUT" : buff_rsc['LUT']*self.coarse_in, + "FF" : buff_rsc['FF']*self.coarse_in, + "BRAM" : buff_rsc['BRAM']*self.coarse_in, + "DSP" : buff_rsc['DSP']*self.coarse_in, + } + + def visualise(self,name): + cluster = pydot.Cluster(name,label=name) + + for i in range(self.coarse_in): + cluster.add_node(pydot.Node( "_".join([name,"buff",str(i)]), label="buff" )) + + # get nodes in and out + nodes_in = [ "_".join([name,"buff",str(i)]) for i in range(self.coarse_in) ] + nodes_out = nodes_in + + return cluster, nodes_in, nodes_out + + def functional_model(self, data, ctrl_drop): + #Buffer is not an ONNX or pytorch op + # check input dimensionality + assert data.shape[0] == self.rows_in(0) , "ERROR (data): invalid row dimension" + assert data.shape[1] == self.cols_in(0) , "ERROR (data): invalid column dimension" + assert data.shape[2] == self.channels_in(0), "ERROR (data): invalid channel dimension" + + out = np.zeros(( + self.rows, + self.cols, + self.channels),dtype=float) + + if self.drop_mode: #non-inverted + if ctrl_drop: + return out + else: + return data #pass through + else: #inverted + if not ctrl_drop: + return out + else: + return data #pass through + diff --git a/fpgaconvnet_optimiser/models/layers/ExitConditionLayer.py b/fpgaconvnet_optimiser/models/layers/ExitConditionLayer.py new file mode 100644 index 00000000..5be72de2 --- /dev/null +++ b/fpgaconvnet_optimiser/models/layers/ExitConditionLayer.py @@ -0,0 +1,106 @@ +""" +Exit Condition layer + +Takes the input and performs Softmax, then takes the 
maximum value, and +compares it to the exit threshold value specified in the model. +This generates a control signal that will terminate the execution early +or allow the input sample to pass through the graph. + +TODO Add other variations of the exit condition. +""" + +import torch +import math +import numpy as np +import pydot + +#from fpgaconvnet_optimiser.models.modules import SlidingWindow +#from fpgaconvnet_optimiser.models.modules import Pool +from fpgaconvnet_optimiser.models.layers import Layer + +class ExitConditionLayer(Layer): + def __init__( + self, + rows: int, + cols: int, + channels: int, + coarse_in: int, + coarse_out: int, + ctrledges: [str], #expecting list + cond_type = 'top1', + data_width = 16 + ): + super().__init__([rows],[cols],[channels],[coarse_in],[coarse_out]) + + self.ctrledges = ctrledges + self.cond_type = cond_type + + #update flags + + #init modules + #TODO + self.modules = { + } + + self.update() + + def layer_info(self,parameters,batch_size=1): + parameters.batch_size = batch_size + parameters.buffer_depth = self.buffer_depth + parameters.rows_in = self.rows_in() + parameters.rows_in = self.rows_in() + parameters.cols_in = self.cols_in() + parameters.channels_in = self.channels_in() + parameters.rows_out = self.rows_out() + parameters.cols_out = self.cols_out() + parameters.channels_out = self.channels_out() + parameters.coarse = self.coarse_in + parameters.coarse_in = self.coarse_in + parameters.coarse_out = self.coarse_out + + def update(self): #TODO + Layer.update(self) + + def rates_graph(self): #TODO + rates_graph = np.zeros( shape=(1,2), dtype=float) + return rates_graph + + def resource(self): #TODO + mod_rsc = 0#self.modules['mod'].rsc() + + # Total + return { + "LUT" : mod_rsc['LUT']*self.coarse_in, + "FF" : mod_rsc['FF']*self.coarse_in, + "BRAM" : mod_rsc['BRAM']*self.coarse_in, + "DSP" : mod_rsc['DSP']*self.coarse_in, + } + + def visualise(self,name): #TODO replace 'mod' with actual modules used + cluster = pydot.Cluster(name,label=name) + + for i in range(self.coarse_in): + cluster.add_node(pydot.Node( "_".join([name,"mod",str(i)]), label="mod" )) + + for i in range(self.coarse_out): + cluster.add_node(pydot.Node( "_".join([name,"mod",str(i)]), label="mod" )) + + # get nodes in and out + nodes_in = [ "_".join([name,"mod",str(i)]) for i in range(self.coarse_in) ] + nodes_out = [ "_".join([name,"mod",str(i)]) for i in range(self.coarse_out) ] + + return cluster, nodes_in, nodes_out + + def functional_model(self, data, threshold): + + assert data.shape[0] == self.rows , "ERROR (data): invalid row dimension" + assert data.shape[1] == self.cols , "ERROR (data): invalid column dimension" + assert data.shape[2] == self.channels, "ERROR (data): invalid channel dimension" + + #instantiate softmax layer + softmax_layer = torch.nn.Softmax() #TODO move softmax to separate layer + pk = softmax_layer(torch.from_numpy(data)).detach() + #get max value + top1 = torch.max(pk) + #True = early exit, drop buffered data + return top1 > threshold diff --git a/fpgaconvnet_optimiser/models/layers/ExitSelectLayer.py b/fpgaconvnet_optimiser/models/layers/ExitSelectLayer.py new file mode 100644 index 00000000..1d4ff91b --- /dev/null +++ b/fpgaconvnet_optimiser/models/layers/ExitSelectLayer.py @@ -0,0 +1,111 @@ +""" +Exit Selection Layer + +This layer merges all exit results into a single point for the output to offchip mem. +The select lines will be driven by the control signal from each exit condition layer. 
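+
+Rough sketch of the intended selection behaviour using the functional model
+below (shapes and the control edge name are illustrative placeholders):
+
+    select = ExitSelectLayer(rows=1, cols=1, channels=10,
+                             coarse_in=1, coarse_out=1,
+                             ctrledge="exit_condition_0")
+    # EEdata and LEdata are (rows, cols, channels) results from the early and late exits
+    out = select.functional_model(EEdata, LEdata, ctrl_pass=True)   # forward early-exit result
+    out = select.functional_model(EEdata, LEdata, ctrl_pass=False)  # forward late-exit result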
+ +""" + +import numpy as np +import math +import pydot +import torch + +from fpgaconvnet_optimiser.models.layers import Layer + +class ExitSelectLayer(Layer): + def __init__( + self, + rows: int, + cols: int, + channels: int, + coarse_in: int, + coarse_out: int, + ctrledge: str, + data_width =16, + ): + # initialise parent class + #rows, cols, channels will be the same for both inputs + super().__init__( [rows,rows], + [cols,cols], + [channels,channels], + [coarse_in,coarse_in], + [coarse_out,coarse_out]) + + #index 0 is then_branch, index 1 is else_branch + #ctrledge links to exit condition layer + self.ctrledge = ctrledge + + #init modules + self.modules = { + } + self.update() + + ## LAYER INFO ## + def layer_info(self,parameters,batch_size=1): + #TODO + #parameters.batch_size = batch_size + #parameters.buffer_depth = self.buffer_depth + #parameters.rows_in = self.rows_in() + #parameters.cols_in = self.cols_in() + #parameters.channels_in = self.channels_in() + #parameters.rows_out = self.rows_out() + #parameters.cols_out = self.cols_out() + #parameters.channels_out = self.channels_out() + #parameters.coarse_in = self.coarse_in + #parameters.coarse_out = self.coarse_out + return + + ## UPDATE MODULES ## + def update(self): #TODO + return + + ### RATES ### + def rates_graph(self): + rates_graph = np.zeros( shape=(1,2) , dtype=float ) + + #rates_graph[0,0] = self.modules['mod'].rate_in() + #rates_graph[0,1] = self.modules['mod'].rate_out() + return rates_graph + + def resource(self): + + mod_rsc = self.modules['mod'].rsc() + + # Total + return { + "LUT" : buff_rsc['LUT']*self.coarse_in, + "FF" : buff_rsc['FF']*self.coarse_in, + "BRAM" : buff_rsc['BRAM']*self.coarse_in, + "DSP" : buff_rsc['DSP']*self.coarse_in, + } + + def visualise(self,name): #TODO replace 'mod' with actual modules used + cluster = pydot.Cluster(name,label=name) + + for i in range(self.coarse_in): + cluster.add_node(pydot.Node( "_".join([name,"mod",str(i)]), label="mod" )) + + for i in range(self.coarse_out): + cluster.add_node(pydot.Node( "_".join([name,"mod",str(i)]), label="mod" )) + + # get nodes in and out + nodes_in = [ "_".join([name,"mod",str(i)]) for i in range(self.coarse_in) ] + nodes_out = [ "_".join([name,"mod",str(i)]) for i in range(self.coarse_out) ] + + return cluster, nodes_in, nodes_out + + def functional_model(self, EEdata, LEdata, ctrl_pass): + #Exit select is not an ONNX or pytorch op + # check input dimensionality + assert EEdata.shape[0] == self.rows , "ERROR: invalid row dimension" + assert EEdata.shape[1] == self.cols , "ERROR: invalid column dimension" + assert EEdata.shape[2] == self.channels, "ERROR: invalid channel dimension" + assert LEdata.shape[0] == self.rows , "ERROR: invalid row dimension" + assert LEdata.shape[1] == self.cols , "ERROR: invalid column dimension" + assert LEdata.shape[2] == self.channels, "ERROR: invalid channel dimension" + + if ctrl_pass: + return EEdata + else: + return LEdata diff --git a/fpgaconvnet_optimiser/models/layers/PoolingLayer.py b/fpgaconvnet_optimiser/models/layers/PoolingLayer.py index 914e58ae..7d5b588e 100644 --- a/fpgaconvnet_optimiser/models/layers/PoolingLayer.py +++ b/fpgaconvnet_optimiser/models/layers/PoolingLayer.py @@ -19,7 +19,7 @@ def __init__( k_size =2, stride =2, pad =0, - fine =1 + fine =1, ): # initialise parent class @@ -52,8 +52,8 @@ def __init__( #self.load_coef() # switching activity - self.sa = sa - self.sa_out = sa_out + #self.sa = sa + #self.sa_out = sa_out def rows_out(self, port_index): assert port_index == 0, "ERROR: Pooling 
layers can only have 1 port" diff --git a/fpgaconvnet_optimiser/models/layers/__init__.py b/fpgaconvnet_optimiser/models/layers/__init__.py index cd481f69..b0c11091 100644 --- a/fpgaconvnet_optimiser/models/layers/__init__.py +++ b/fpgaconvnet_optimiser/models/layers/__init__.py @@ -2,13 +2,17 @@ Layers are comprised of modules. They have the same functionality of the equivalent layers of the CNN model. """ -from .Layer import Layer -from .BatchNormLayer import BatchNormLayer -from .InnerProductLayer import InnerProductLayer -from .PoolingLayer import PoolingLayer -from .ReLULayer import ReLULayer -from .ConvolutionLayer import ConvolutionLayer -from .SqueezeLayer import SqueezeLayer -from .SplitLayer import SplitLayer -from .LRNLayer import LRNLayer -from .SoftMaxLayer import SoftMaxLayer +from .Layer import Layer +from .BatchNormLayer import BatchNormLayer +from .InnerProductLayer import InnerProductLayer +from .PoolingLayer import PoolingLayer +from .ReLULayer import ReLULayer +from .ConvolutionLayer import ConvolutionLayer +from .SqueezeLayer import SqueezeLayer +from .LRNLayer import LRNLayer +from .SoftMaxLayer import SoftMaxLayer +#EE Layers +from .SplitLayer import SplitLayer +from .BufferLayer import BufferLayer +from .ExitConditionLayer import ExitConditionLayer +from .ExitSelectLayer import ExitSelectLayer diff --git a/fpgaconvnet_optimiser/models/modules/Buffer.py b/fpgaconvnet_optimiser/models/modules/Buffer.py new file mode 100644 index 00000000..b1edaed2 --- /dev/null +++ b/fpgaconvnet_optimiser/models/modules/Buffer.py @@ -0,0 +1,110 @@ +""" +Buffering Module + +Stores intermediate compute information such as results from Conv or Pool layers. +During DSE the required size will be calculated to store intermediate results at +branching layers. The position of the buffer layer will then be moved along a +given branch until the buffer size is feasible and the latency of the exit +condition is mitigated/matched. For effective pipelining I think. + +Secondary function of the buffer is to "drop" a partial calculation. +Clear a FIFO - takes X number of cycles? +Drop signal will be control signal from the Exit Condition. + +Future goal will be to have buffer as an offchip memory link. +In this case, the drop might not be used. +""" + +from fpgaconvnet_optimiser.models.modules import Module +import numpy as np +import math +import os + +class Buffer(Module): + def __init__( + self, + rows, + cols, + channels, + ctrledge, + drop_mode =True, + data_width=16 + ): + # module name + self.name = "buff" + + # init module + Module.__init__(self,rows,cols,channels,data_width) + + # init variables + self.ctrledge = ctrledge + self.drop_mode = drop_mode + #self.filters = filters + #self.groups = groups + + # load resource coefficients + #TODO resource coefficients file for buffer module + #self.rsc_coef = np.load(os.path.join(os.path.dirname(__file__), + # "../../coefficients/buffer_rsc_coef.npy")) + + def module_info(self): + return { + 'type' : self.__class__.__name__.upper(), + 'rows' : self.rows_in(), + 'cols' : self.cols_in(), + 'groups' : self.groups, + 'channels' : self.channels_in(), + 'rows_out' : self.rows_out(), + 'cols_out' : self.cols_out(), + 'channels_out' : self.channels_out() + } + + def utilisation_model(self): + #TODO work out what this should be + #how should the FIFOs be laid out? 
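+        # Assumed interpretation of the terms below: a constant offset, a per-word
+        # control cost scaling with data_width, and a storage term scaling with
+        # data_width*channels. The matching coefficients are not trained yet, so
+        # rsc() currently ignores this model and only estimates BRAM directly.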
+ return [ + 1, + self.data_width, + self.data_width*self.channels + ] + + def pipeline_depth(self): + #TODO work out if this module can be/needs pipelining + return 0 + + + def rsc(self): + #basic version is just a single FIFO matching input dims + bram_buffer_size = self.rows*self.cols*self.channels*self.data_width + + bram_buffer = 0 + if bram_buffer_size >= 512: #taken from Accum.py modules + bram_buffer = math.ceil( (bram_acc_buffer_size)/18000) + return { + "LUT" : 0, #int(np.dot(self.utilisation_model(), self.rsc_coef[0])), + "BRAM" : bram_buffer, + "DSP" : 0, + "FF" : 0 #int(np.dot(self.utilisation_model(), self.rsc_coef[3])), + } + + def functional_model(self, data, ctrl_drop): + # check input dimensionality + assert data.shape[0] == self.rows , "ERROR: invalid row dimension" + assert data.shape[1] == self.cols , "ERROR: invalid column dimension" + assert data.shape[2] == self.channels, "ERROR: invalid channel dimension" + + out = np.zeros(( + self.rows, + self.cols, + self.channels),dtype=float) + + if self.drop_mode: #non-inverted + if ctrl_drop: + return out + else: + return data #pass through + else: #inverted + if not ctrl_drop: + return out + else: + return data #pass through diff --git a/fpgaconvnet_optimiser/models/modules/Module.py b/fpgaconvnet_optimiser/models/modules/Module.py index 9d60f674..f68eeabb 100644 --- a/fpgaconvnet_optimiser/models/modules/Module.py +++ b/fpgaconvnet_optimiser/models/modules/Module.py @@ -9,16 +9,16 @@ class Module: """ - modules are the fundamental building block for the hardware + modules are the fundamental building block for the hardware framework. In this base class, performance and resource model - templates are included, as well as a template for functional + templates are included, as well as a template for functional models. All modules are derived from this base class and contain - the same methods. + the same methods. .. note:: - The model expects that the module is run for a single three - dimensional featuremap. For intermediate modules within a layer, - they may not be operating on a three dimensional tensor, and + The model expects that the module is run for a single three + dimensional featuremap. For intermediate modules within a layer, + they may not be operating on a three dimensional tensor, and so the `rows`, `cols` and `channels` attributes are representative of the tensor if it was flattened to three dimensions. """ @@ -48,10 +48,10 @@ def __init__( channels: int channel dimension of input featuremap data_width: int - bitwidth of featuremap pixels + bitwidth of featuremap pixels rsc_coef: list list of resource model coefficients. Corresponds - to `LUT`, `BRAM`, `DSP` and `FF` resources in + to `LUT`, `BRAM`, `DSP` and `FF` resources in that order. """ # init variables @@ -62,16 +62,19 @@ def __init__( self.data_width = data_width # coefficients - self.rsc_coef = {} - rsc_types = ["FF","LUT","DSP","BRAM"] - for rsc_type in rsc_types: - self.rsc_coef[rsc_type] = np.load(os.path.join(os.path.dirname(__file__), - f"../../coefficients/{self.name}_{rsc_type.lower()}.npy")) + #self.rsc_coef = {} + #rsc_types = ["FF","LUT","DSP","BRAM"] + #for rsc_type in rsc_types: + # self.rsc_coef[rsc_type] = np.load(os.path.join(os.path.dirname(__file__), + # f"../../coefficients/{self.name}_{rsc_type.lower()}.npy")) + self.static_coef = [ 0 ] + self.dynamic_coef = [ 0 ] + self.rsc_coef = [ 0,0,0,0 ] def module_info(self): """ creates a dictionary containing information and - parameters for the module. + parameters for the module. 
""" return { 'type' : self.__class__.__name__.upper(), @@ -85,7 +88,7 @@ def module_info(self): def load_coef(self,rsc_coef_path): """ - loads coefficients of the module's resource + loads coefficients of the module's resource and power models. Parameters @@ -94,7 +97,7 @@ def load_coef(self,rsc_coef_path): path to `.npy` file containing resource model coefficients. """ - self.rsc_coef = np.load(rsc_coef_path) + self.rsc_coef = np.load(rsc_coef_path) def utilisation_model(self): """ @@ -104,7 +107,7 @@ def utilisation_model(self): utilisation of resources model. Defaults to zero resources. """ - + return [0] @@ -166,7 +169,7 @@ def rate_in(self): """ Returns ------- - float + float rate of words into module. As a fraction of a clock cycle. @@ -178,8 +181,8 @@ def rate_out(self): """ Returns ------- - float - rate of words out of the module. As a fraction + float + rate of words out of the module. As a fraction of a clock cycle. default is 1.0 @@ -190,10 +193,10 @@ def get_latency(self): """ Returns ------- - int - calculates the number of clock cycles latency + int + calculates the number of clock cycles latency it takes for the module to process a featuremap. - First latency in and latency out is calculated, + First latency in and latency out is calculated, then the latency of the module is the largest of the two. """ @@ -205,19 +208,19 @@ def pipeline_depth(self): """ Returns ------- - int + int depth of the pipeline for the module in clock cycles. default is 0. """ - return 0 + return 0 def rsc(self): """ Returns ------- - dict + dict estimated resource usage of the module. Uses the resource coefficients for the estimate. """ @@ -232,9 +235,9 @@ def functional_model(self,data): """ functional model of the module. Used for verification of hardware modules. 
- + Returns ------- - np.array + np.array """ return data diff --git a/fpgaconvnet_optimiser/models/modules/__init__.py b/fpgaconvnet_optimiser/models/modules/__init__.py index 00480b73..1f15835d 100644 --- a/fpgaconvnet_optimiser/models/modules/__init__.py +++ b/fpgaconvnet_optimiser/models/modules/__init__.py @@ -12,3 +12,5 @@ from .ReLU import ReLU from .SlidingWindow import SlidingWindow from .Squeeze import Squeeze +#EE modules +from .Buffer import Buffer diff --git a/fpgaconvnet_optimiser/tools/layer_enum.py b/fpgaconvnet_optimiser/tools/layer_enum.py index 15e2bebd..036dd3e0 100644 --- a/fpgaconvnet_optimiser/tools/layer_enum.py +++ b/fpgaconvnet_optimiser/tools/layer_enum.py @@ -1,5 +1,5 @@ from enum import Enum -import fpgaconvnet_optimiser.proto.fpgaconvnet_pb2 as fpgaconvnet_pb2 +import fpgaconvnet_optimiser.proto.fpgaconvnet_pb2 as fpgaconvnet_pb2 # Get enumeration from: # https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto @@ -20,12 +20,19 @@ class LAYER_TYPE(Enum): Split = 42 Merge = 43 Squeeze = 44 - Transpose = 45 + Transpose = 45 Flatten = 46 Cast = 47 Clip = 48 Shape = 49 - + #EE Layers - arbitrarily assigned + If = 50 + ReduceMax = 51 + Greater = 52 + Identity = 53 + #Not an ONNX op in this case + Buffer = 54 + @classmethod def get_type(cls, t): if type(t) is str: @@ -34,23 +41,23 @@ def get_type(cls, t): return cls(t) def to_proto_layer_type(layer_type): - layer_types = { - LAYER_TYPE.Convolution : fpgaconvnet_pb2.layer.layer_type.CONVOLUTION, - LAYER_TYPE.InnerProduct : fpgaconvnet_pb2.layer.layer_type.INNER_PRODUCT, - LAYER_TYPE.Pooling : fpgaconvnet_pb2.layer.layer_type.POOLING, - LAYER_TYPE.ReLU : fpgaconvnet_pb2.layer.layer_type.RELU, - LAYER_TYPE.Squeeze : fpgaconvnet_pb2.layer.layer_type.SQUEEZE, - LAYER_TYPE.Concat : fpgaconvnet_pb2.layer.layer_type.CONCAT, - LAYER_TYPE.BatchNorm : fpgaconvnet_pb2.layer.layer_type.BATCH_NORM + layer_types = { + LAYER_TYPE.Convolution : fpgaconvnet_pb2.layer.layer_type.CONVOLUTION, + LAYER_TYPE.InnerProduct : fpgaconvnet_pb2.layer.layer_type.INNER_PRODUCT, + LAYER_TYPE.Pooling : fpgaconvnet_pb2.layer.layer_type.POOLING, + LAYER_TYPE.ReLU : fpgaconvnet_pb2.layer.layer_type.RELU, + LAYER_TYPE.Squeeze : fpgaconvnet_pb2.layer.layer_type.SQUEEZE, + LAYER_TYPE.Concat : fpgaconvnet_pb2.layer.layer_type.CONCAT, + LAYER_TYPE.BatchNorm : fpgaconvnet_pb2.layer.layer_type.BATCH_NORM } return layer_types.get(layer_type, lambda: "Invalid Layer Type") def from_proto_layer_type(layer_type): - layer_types = { - fpgaconvnet_pb2.layer.layer_type.CONVOLUTION : LAYER_TYPE.Convolution, + layer_types = { + fpgaconvnet_pb2.layer.layer_type.CONVOLUTION : LAYER_TYPE.Convolution, fpgaconvnet_pb2.layer.layer_type.INNER_PRODUCT : LAYER_TYPE.InnerProduct, - fpgaconvnet_pb2.layer.layer_type.POOLING : LAYER_TYPE.Pooling, - fpgaconvnet_pb2.layer.layer_type.RELU : LAYER_TYPE.ReLU, + fpgaconvnet_pb2.layer.layer_type.POOLING : LAYER_TYPE.Pooling, + fpgaconvnet_pb2.layer.layer_type.RELU : LAYER_TYPE.ReLU, fpgaconvnet_pb2.layer.layer_type.SQUEEZE : LAYER_TYPE.Squeeze, fpgaconvnet_pb2.layer.layer_type.CONCAT : LAYER_TYPE.Concat, fpgaconvnet_pb2.layer.layer_type.BATCH_NORM : LAYER_TYPE.BatchNorm, diff --git a/fpgaconvnet_optimiser/tools/onnx_helper.py b/fpgaconvnet_optimiser/tools/onnx_helper.py index db335f1b..1b3d1b9a 100644 --- a/fpgaconvnet_optimiser/tools/onnx_helper.py +++ b/fpgaconvnet_optimiser/tools/onnx_helper.py @@ -145,23 +145,41 @@ def _name(node): #return _format_name( node.name if node.name else node.output[0] ) return _format_name( 
node.output[0] ) -def get_model_node(model, name): +def get_model_node(model, name, submodels=[]): for node in model.graph.node: if _name(node) == name: # formatted match return node + for sm in submodels: #look through submodels + for subnode in sm.g.node: #g used in subgraphs for graph + if _name(subnode) == name: # formatted match + return subnode + raise NameError("Node name not in graphs") -def get_model_value_info(model, name): +def get_model_value_info(model, name, submodels=[]): for node in model.graph.value_info: if _format_name(node.name) == name: # formatted match return node + for sm in submodels: #look through submodels + for subnode in sm.g.value_info: #g used in subgraphs for graph + if _format_name(subnode.name) == name: # formatted match + return subnode def get_model_input(model, name): for node in model.graph.input: if node.name == name: # exact match return node +def get_model_output(model, name, submodels=[]): + for node in model.graph.output: + if node.name == name: + return node + for sm in submodels: #look through submodels + for subnode in sm.g.output: #g used in subgraphs for graph + if _format_name(subnode.name) == name: # formatted match + return subnode + def get_model_initializer(model, name, to_tensor=True): - for node in model.graph.initializer: + for node in model.graph.initializer: #works with subgraphs if node.name == name: # exact match if to_tensor: return onnx.numpy_helper.to_array(node) @@ -175,9 +193,12 @@ def _format_attr(attribute): attr_out[attr.name] = [ int(i) for i in attr.ints ] return attr_out -def _out_dim(model, name): +def _out_dim(model, submodels, name): dim = [0,0,0] - value_info = get_model_value_info(model, name) + value_info = get_model_value_info(model, name, submodels) + if value_info == None: + #try looking through outputs - required for subgraphs + value_info = get_model_output(model, name, submodels) if len(value_info.type.tensor_type.shape.dim) == 4: #dim[0] = int(node.type.tensor_type.shape.dim[0].dim_value) # batch size dim[0] = int(value_info.type.tensor_type.shape.dim[1].dim_value) # channels @@ -190,5 +211,3 @@ def _out_dim(model, name): dim[1] = 1 # rows dim[2] = 1 # cols return dim - - diff --git a/fpgaconvnet_optimiser/tools/parser.py b/fpgaconvnet_optimiser/tools/parser.py index 0aa2bf61..129b1212 100644 --- a/fpgaconvnet_optimiser/tools/parser.py +++ b/fpgaconvnet_optimiser/tools/parser.py @@ -2,7 +2,7 @@ import pydot import os import random -import copy +import copy import onnx import onnx.utils import onnx.numpy_helper @@ -18,11 +18,16 @@ from fpgaconvnet_optimiser.models.layers import ReLULayer from fpgaconvnet_optimiser.models.layers import LRNLayer from fpgaconvnet_optimiser.models.layers import SoftMaxLayer +#EE layers +from fpgaconvnet_optimiser.models.layers import BufferLayer +from fpgaconvnet_optimiser.models.layers import SplitLayer +from fpgaconvnet_optimiser.models.layers import ExitConditionLayer +from fpgaconvnet_optimiser.models.layers import ExitSelectLayer from fpgaconvnet_optimiser.tools.layer_enum import LAYER_TYPE def _layer_type(op_type): - layer_types = { + layer_types = { "Conv" : LAYER_TYPE.Convolution, "Gemm" : LAYER_TYPE.InnerProduct, "Relu" : LAYER_TYPE.ReLU, @@ -40,6 +45,15 @@ def _layer_type(op_type): "Clip" : LAYER_TYPE.Clip, "Shape" : LAYER_TYPE.Shape, "Squeeze" : LAYER_TYPE.Squeeze, + #placeholder layers for branching + exit decision + "If" : LAYER_TYPE.If, + "ReduceMax" : LAYER_TYPE.ReduceMax, + "Greater" : LAYER_TYPE.Greater, + "Identity" : LAYER_TYPE.Identity, + #hw layer to help 
split dataflow + "Split" : LAYER_TYPE.Split, + #flexble buffer point for intermediate results + "Buffer" : LAYER_TYPE.Buffer, } return layer_types.get(op_type, lambda: TypeError) @@ -62,6 +76,12 @@ def filter_node_types(graph, layer_type): def build_graph(model): # graph structure graph = nx.DiGraph() + submodels = [] #links to the subgraphs in If ops + ctrledges = [] #the name/ID of the if nodes [[ifnode,then,else, cond]] + edges = [] #dataflow edges + exitedges = [] #dataflow from exits to parent If op + #TODO this would be the point to add in branch execution rates + # add all nodes from network for node in model.graph.node: # get name of node @@ -69,12 +89,35 @@ def build_graph(model): # add node to graph graph.add_node( name, type=_layer_type(node.op_type), hw=None, inputs={} ) if _layer_type(node.op_type) in [ LAYER_TYPE.Convolution, LAYER_TYPE.InnerProduct ]: - graph.nodes[name]['inputs'] = { "weights": "", "bias": "" } + graph.nodes[name]['inputs'] = { "weights": "", "bias": "" } + + #add subgraphs to the network + if _layer_type(node.op_type) == LAYER_TYPE.If: + ifnode = [name, None, None, None] + #access the subgraphs + for subgraph in node.attribute: + submodels.append(subgraph) #record link to submodels + subnode_head = onnx_helper._name(subgraph.g.node[0]) + if subgraph.name == "then_branch": + ifnode[1] = subnode_head + elif subgraph.name == "else_branch": + ifnode[2] = subnode_head + else: + raise NameError("Incorrect branch names") + + last_name = None + for subnode in subgraph.g.node: + subname = onnx_helper._name(subnode) + # add sub graph node to graph + graph.add_node(subname, type=_layer_type(subnode.op_type), hw=None, inputs={} ) + last_name=subname + exitedges.append((last_name, name)) #dataflow from last node in branch to If op + ctrledges.append(ifnode) + # add all edges from network - edges = [] for name in graph.nodes(): # get node from model - node = onnx_helper.get_model_node(model, name) + node = onnx_helper.get_model_node(model, name, submodels=submodels) # add edges into node for input_node in node.input: # add initializers @@ -107,15 +150,84 @@ def build_graph(model): for edge in edges: graph.add_edge(*edge) # return graph - return graph + return submodels, graph, ctrledges, exitedges + +def add_split_nodes(graph, ctrledges): + #adding the split nodes for branching + splitnodes = [] + for node in graph.nodes: + successors = graphs.get_next_nodes(graph, node) + if len(successors) > 1: #general split node placement + splitnodes.append((node, successors)) + save_nodes = [] #store the split nodes for later use (not part of model) + for i,(node,successors) in enumerate(splitnodes): + splitname = "split_" + str(i) + graph.add_node(splitname, type=LAYER_TYPE.Split, hw=None, inputs={} ) + for succ in successors: + graph.remove_edge(node, succ) + graph.add_edge(splitname, succ) + graph.add_edge(node, splitname) + save_nodes.append(splitname) + return save_nodes -def add_hardware(model, graph): +def add_buffer_nodes(graph, ctrledges): + #adding buffer nodes to store/buffer compute at branch points + def _add_buffer_nodes(graph, ctrledges, node, instance, save_nodes): + predec = graphs.get_prev_nodes(graph, node) + if len(predec) > 1: + raise Exception("Multiple predecessors not supported") + buffername = "buffer_" + str(instance) + graph.add_node(buffername, type=LAYER_TYPE.Buffer, hw=None, inputs={} ) + #insert buffer layer + graph.remove_edge(predec[0], node) + graph.add_edge(predec[0], buffername) + graph.add_edge(buffername, node) + #update ctrledge to point to 
buffer + save_nodes.append(buffername) + return buffername + + save_nodes = [] #store the buffer nodes for later use (not part of model) + i=0 + for branch_start in ctrledges: + buffname = _add_buffer_nodes(graph, ctrledges, branch_start[1], i, save_nodes) + branch_start[1] = buffname #then_branch + buffname = _add_buffer_nodes(graph, ctrledges, branch_start[2], i+1, save_nodes) + branch_start[2] = buffname #else_branch + i+=2 + return save_nodes + +def update_crtledges(graph, ctrledges): + #ASSUMPTION: that target layer will be immediate predecessor + for node in ctrledges: #will perform at each If op + #ctrledges[i][0] is the If node + predec = graphs.get_prev_nodes(graph, node[0]) + if len(predec) > 1: + raise Exception("Multiple predecessors not supported") + if graph.nodes[predec[0]]["type"] not in [LAYER_TYPE.Greater]: + raise Exception("Other layer types not supported") + graph.remove_edge(predec[0], node[0]) #remove dataflow edge + node[3] = node[0] #set If op to have ctrl edge + node[0] = predec[0] #Replace with predecessor + +def find_ctrl_origin(graph, ctrledges, node): + for ctrl in ctrledges: + if node == ctrl[1]: + return ctrl[0], True #then branch link so EE + elif node == ctrl[2]: + return ctrl[0], False #else branch link so not EE + elif node == ctrl[3]: #for linking exit select layer + return ctrl[0], None #TODO tidy this up + raise Exception("Node has no control input") + +def add_hardware(model, submodels, graph, ctrledges, hw_only_nodes): # iterate over nodes in graph - for node in model.graph.node: - # get node name - name = onnx_helper._name(node) - # check if node in graph - if not name in graph.nodes(): + all_nodes = [*model.graph.node] + for submodel in submodels: + for subnode in submodel.g.node: + all_nodes.append(subnode) + for node in all_nodes: + name = onnx_helper._name(node) # get node name + if not name in graph.nodes(): # check if node in graph continue # Convolution layer if graph.nodes[name]['type'] == LAYER_TYPE.Convolution: @@ -124,7 +236,7 @@ def add_hardware(model, graph): weights_dim = onnx_helper.get_model_input(model,weights_input) filters = int(weights_dim.type.tensor_type.shape.dim[0].dim_value) # get node attributes - attr = onnx_helper._format_attr(node.attribute) + attr = onnx_helper._format_attr(node.attribute) # default attributes attr.setdefault("group", 1) attr.setdefault("strides", [1,1]) @@ -136,8 +248,8 @@ def add_hardware(model, graph): 0, # initialise rows to 0 0, # initialise cols to 0 0, # initialise channels to 0 - 1, # initialise coarse in to 0 - 1, # initialise coarse out to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 k_size =attr["kernel_shape"][0], stride =attr["strides"][0], pad =attr["pads"][0], @@ -156,14 +268,14 @@ def add_hardware(model, graph): 0, # initialise rows to 0 0, # initialise cols to 0 0, # initialise channels to 0 - 1, # initialise coarse in to 0 - 1, # initialise coarse out to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 ) continue # Pooling layer if graph.nodes[name]['type'] == LAYER_TYPE.Pooling: # get node attributes - attr = onnx_helper._format_attr(node.attribute) + attr = onnx_helper._format_attr(node.attribute) # default attributes attr.setdefault("strides", [1,1]) attr.setdefault("pads", [0,0,0,0]) @@ -173,8 +285,8 @@ def add_hardware(model, graph): 0, # initialise rows to 0 0, # initialise cols to 0 0, # initialise channels to 0 - 1, # initialise coarse in to 0 - 1, # initialise coarse out to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse 
out to 1 pool_type = 'max', # TODO: change so that it does AVG also k_size =attr["kernel_shape"][0], stride =attr["strides"][0], @@ -188,8 +300,8 @@ def add_hardware(model, graph): 0, # initialise rows to 0 0, # initialise cols to 0 0, # initialise channels to 0 - 1, # initialise coarse in to 0 - 1, # initialise coarse out to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 ) continue # BatchNorm Layer @@ -198,14 +310,79 @@ def add_hardware(model, graph): 0, # initialise rows to 0 0, # initialise cols to 0 0, # initialise channels to 0 - 1, # initialise coarse in to 0 - 1, # initialise coarse out to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 ) continue - raise NameError - print(name,graph.nodes[name]['type']) + #top1 exit criterion layer + if graph.nodes[name]['type'] == LAYER_TYPE.Greater: + #need to have some idea of the hw layer for EC + #add the control edges to the buffers/drop points + for ctrl in ctrledges: + if name == ctrl[0]: + ctrlout = ctrl[1:] + if len(ctrlout) == 0: + raise NameError("Control edges not found") + graph.nodes[name]['hw'] = ExitConditionLayer( + 0, # initialise rows to 0 + 0, # initialise cols to 0 + 0, # initialise channels to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 + ctrlout + ) + continue + #early exit layer + if graph.nodes[name]['type'] == LAYER_TYPE.If: + #with two exits it makes sense to pull from the if + #will need to generalise assumptions for >2 exits + #graph - two dataflow inputs, pick either or on hw level + ctrl_origin, _ = find_ctrl_origin(graph, ctrledges, name) + graph.nodes[name]['hw'] = ExitSelectLayer( + 0, # initialise rows to 0 + 0, # initialise cols to 0 + 0, # initialise channels to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 + ctrl_origin + ) + continue + raise NameError(name, node.op_type) -def add_dimensions(model, graph): + #add hardware for the non-ONNX nodes + for name in hw_only_nodes: + #split layer + if graph.nodes[name]['type'] == LAYER_TYPE.Split: + #has input and minimum two outputs + graph.nodes[name]['hw'] = SplitLayer( + 0, # initialise rows to 0 + 0, # initialise cols to 0 + 0, # initialise channels to 0 + 1, # initialise coarse to 1 + ports_out = 2, #TODO make this variable + ) + continue + #buffer layer + if graph.nodes[name]['type'] == LAYER_TYPE.Buffer: + #buffer point to be moved up and down links-depending on exit laten + #maybe do a size calc here? + #might need to specify link from EC to here? 
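+            # find_ctrl_origin returns the exit-condition (Greater) node driving this
+            # buffer, plus a flag: True if the buffer sits on the then_branch (early
+            # exit), False for the else_branch. Passing the flag as drop_mode means
+            # the then_branch buffer drops its contents when the control signal is
+            # True, while the else_branch buffer responds to the inverted signal.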
+ ctrl_origin, EE_flag = find_ctrl_origin(graph, ctrledges, name) + #ASSUMPTION: then_branch corresponds to EE + graph.nodes[name]['hw'] = BufferLayer( + 0, # initialise rows to 0 + 0, # initialise cols to 0 + 0, # initialise channels to 0 + 1, # initialise coarse in to 1 + 1, # initialise coarse out to 1 + ctrl_origin, + drop_mode=EE_flag + ) + continue + raise NameError(name, graph.nodes[name]['type']) + + +def add_dimensions(model, submodels, graph): # add input dimensions input_channels = int(model.graph.input[0].type.tensor_type.shape.dim[1].dim_value) input_rows = int(model.graph.input[0].type.tensor_type.shape.dim[2].dim_value) @@ -218,25 +395,41 @@ def add_dimensions(model, graph): # iterate over layers in model nodes = list(graph.nodes()) nodes.remove(input_node) + + def _find_valid_prev_node(graph, node): + prev_nodes = graphs.get_prev_nodes(graph, node) + if len(prev_nodes) > 1: + raise Exception("Multiple inputs not currently supported") + if graph.nodes[prev_nodes[0]]['type'] in [LAYER_TYPE.Split, LAYER_TYPE.Buffer]: + return _find_valid_prev_node(graph, prev_nodes[0]) #go round again + else: + return prev_nodes[0] + for node in nodes: # find previous node prev_nodes = graphs.get_prev_nodes(graph, node) - for prev_node in prev_nodes: # TODO: support parallel networks - # get previous node output dimensions - dim = onnx_helper._out_dim(model, prev_node) - # update input dimensions - graph.nodes[node]['hw'].channels[0] = dim[0] - graph.nodes[node]['hw'].rows[0] = dim[1] - graph.nodes[node]['hw'].cols[0] = dim[2] + # TODO: support parallel networks + if len(prev_nodes) > 1 and graph.nodes[node]['type'] != LAYER_TYPE.If: + #If layer has 2 dataflow inputs of identical shape - so use the first + raise Exception("Multiple inputs not currently supported") + prev_node = prev_nodes[0] + #split and buffer layers won't have value info - so use prev prev nodes. 
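+        # _find_valid_prev_node (defined above) walks back past consecutive
+        # Split/Buffer nodes to the nearest node from the original ONNX graph,
+        # whose output dimensions are reused here (assuming the hardware-only
+        # nodes do not change the featuremap shape).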
+ if graph.nodes[prev_node]['type'] in [LAYER_TYPE.Split, LAYER_TYPE.Buffer]: + prev_node = _find_valid_prev_node(graph, prev_node) + # get previous node output dimensions + dim = onnx_helper._out_dim(model, submodels, prev_node) + # update input dimensions + graph.nodes[node]['hw'].channels[0] = dim[0] + graph.nodes[node]['hw'].rows[0] = dim[1] + graph.nodes[node]['hw'].cols[0] = dim[2] def parse_net(filepath,view=True): - # load onnx model model = onnx_helper.load(filepath) - + # get graph - graph = build_graph(model) - + submodels, graph, ctrledges, exitedges = build_graph(model) + # remove input node remove_nodes = [] for node in graph.nodes: @@ -253,18 +446,36 @@ def parse_net(filepath,view=True): filter_node_types(graph, LAYER_TYPE.Cast) filter_node_types(graph, LAYER_TYPE.Squeeze) filter_node_types(graph, LAYER_TYPE.Shape) + #TODO softmax needed for exit condition, remove filter when ONNX input updated filter_node_types(graph, LAYER_TYPE.Softmax) filter_node_types(graph, LAYER_TYPE.LRN) + #remove ReduceMax since it's implied as part of the EC + filter_node_types(graph, LAYER_TYPE.ReduceMax) + + #add in split and buffer/offchip store layer nodes + hw_only_nodes = add_split_nodes(graph, ctrledges) + hw_only_nodes += add_buffer_nodes(graph, ctrledges) + + #shift control edge start from If to Greater (the layer standing in as EC) + #append the ctrl edge from the Greater to If and remove the data edge + update_crtledges(graph, ctrledges) + + #determine Early Exit points (Identity operations, edge to exit) + for eedge in exitedges: + graph.add_edge(*eedge) + #remove pass through node + filter_node_types(graph, LAYER_TYPE.Identity) + #TODO separate softmax layer from other layers in model + # add hardware to graph - add_hardware(model, graph) + add_hardware(model, submodels, graph, ctrledges, hw_only_nodes) # add layer dimensions - add_dimensions(model, graph) + add_dimensions(model, submodels, graph) # update all layers for node in graph.nodes: graph.nodes[node]['hw'].update() return model, graph - diff --git a/fpgaconvnet_optimiser/tools/parsing_ee_expr.py b/fpgaconvnet_optimiser/tools/parsing_ee_expr.py new file mode 100644 index 00000000..6bc5558e --- /dev/null +++ b/fpgaconvnet_optimiser/tools/parsing_ee_expr.py @@ -0,0 +1,46 @@ +from graphviz import Digraph +import pydot +import os +import random +import copy +import onnx +import onnx.utils +import onnx.numpy_helper +import networkx as nx + + +import onnxoptimizer as optimizer + +#import fpgaconvnet_optimiser.tools.graphs as graphs +#import fpgaconvnet_optimiser.tools.onnx_helper as onnx_helper + +#from fpgaconvnet_optimiser.models.layers import BatchNormLayer +#from fpgaconvnet_optimiser.models.layers import ConvolutionLayer +#from fpgaconvnet_optimiser.models.layers import InnerProductLayer +#from fpgaconvnet_optimiser.models.layers import PoolingLayer +#from fpgaconvnet_optimiser.models.layers import ReLULayer +#from fpgaconvnet_optimiser.models.layers import LRNLayer +#from fpgaconvnet_optimiser.models.layers import SoftMaxLayer + +#from fpgaconvnet_optimiser.tools.layer_enum import LAYER_TYPE + +#import the current parser functions to see what they can do +import parser + +def main(): + print("Parser experiments") + + #attempt to parse the graph and see what errors + filepath = "/home/benubu/phd/fpgaconvnet-optimiser/examples/models/speedy-brn-top1ee-bsf.onnx" + #filepath = "/home/benubu/phd/fpgaconvnet-optimiser/examples/models/pt_fulltest.onnx" + model, graph, ctrledges = parser.parse_net(filepath, view=False) #check what view 
does + + print(graph.nodes) + print(graph.edges) + print(ctrledges) + + for node in graph.nodes: + print(graph.nodes[node]['hw']) + +if __name__ == "__main__": + main()